Source code for mlproblems.classification

# Copyright 2011 Hugo Larochelle. All rights reserved.
# 
# Redistribution and use in source and binary forms, with or without modification, are
# permitted provided that the following conditions are met:
# 
#    1. Redistributions of source code must retain the above copyright notice, this list of
#       conditions and the following disclaimer.
# 
#    2. Redistributions in binary form must reproduce the above copyright notice, this list
#       of conditions and the following disclaimer in the documentation and/or other materials
#       provided with the distribution.
# 
# THIS SOFTWARE IS PROVIDED BY Hugo Larochelle ``AS IS'' AND ANY EXPRESS OR IMPLIED
# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL Hugo Larochelle OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
# ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# 
# The views and conclusions contained in the software and documentation are those of the
# authors and should not be interpreted as representing official policies, either expressed
# or implied, of Hugo Larochelle.

"""
The ``mlproblems.classification`` module contains MLProblems specifically
for classification problems.

This module contains the following classes:

* ClassificationProblem:   Generates a classification problem.
* ClassificationFrom3DLabelingProblem:   Generates a classification problem from a 3D labeling problem.
* ClassSubsetProblem:   Extracts examples from a subset of all classes.

"""

from generic import MLProblem
import numpy as np

class ClassificationProblem(MLProblem):
    """
    Generates a classification problem.

    The data should be an iterator over input/target pairs.

    **Required metadata:**

    * ``'targets'``: The set of possible values for the target.

    **Defined metadata:**

    * ``'class_to_id'``: A dictionary mapping from elements in ``'targets'`` to a class id.

    """

    def __init__(self, data=None, metadata={}, call_setup=True):
        MLProblem.__init__(self, data, metadata)
        if call_setup:
            ClassificationProblem.setup(self)

    def __iter__(self):
        for input, target in self.data:
            if target not in self.class_to_id:
                # Debug output for a target that is missing from the
                # 'class_to_id' mapping (the yield below would then fail).
                print target
            yield input, self.class_to_id[target]

    def setup(self):
        # Create the class (string) to id (integer) mapping
        self.class_to_id = {}
        current_id = 0
        for target in self.metadata['targets']:
            self.class_to_id[target] = current_id
            current_id += 1
        self.metadata['class_to_id'] = self.class_to_id

    def apply_on(self, new_data, new_metadata={}):
        if self.__source_mlproblem__ is not None:
            new_data = self.__source_mlproblem__.apply_on(new_data, new_metadata)
            new_metadata = {}  # new_data should already contain the new_metadata, since it is an MLProblem

        new_problem = ClassificationProblem(new_data, new_metadata, call_setup=False)
        new_problem.metadata['class_to_id'] = self.metadata['class_to_id']
        new_problem.class_to_id = self.class_to_id
        return new_problem
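
# Example usage of ClassificationProblem (a minimal sketch; the toy data and
# metadata below are illustrative only and not part of the library):
#
#     trainset = [([0.2, 0.5], 'cat'), ([0.9, 0.1], 'dog'), ([0.4, 0.4], 'cat')]
#     metadata = {'targets': set(['cat', 'dog'])}
#     trainproblem = ClassificationProblem(trainset, metadata)
#     for input, class_id in trainproblem:
#         # targets are mapped to integer ids via metadata['class_to_id']
#         print input, class_id
#
# The same mapping can then be reused on new data with
# ``trainproblem.apply_on(testset, test_metadata)``, so that train and test
# sets share the same 'class_to_id'.
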
class ClassificationFrom3DLabelingProblem(MLProblem):
    """
    Generates a classification problem from a 3D labeling problem.

    The data should be an iterator over input/target pairs (tuples) of 3D
    NumPy arrays. This MLProblem will generate from them a regular
    classification problem, where the generated input is a 3D patch
    extracted around some given voxel and the target is the label
    associated with the center voxel.

    Option ``n_examples_per_labeling`` determines how many patches will be
    created from each input/labeling pair.

    Option ``proportion_per_label`` determines the proportion of patches per
    label that will be extracted from the data.

    Option ``patch_sizes`` determines the dimensions of the extracted patch.
    Each dimension is the radius around the center voxel.

    Option ``filter_based_on_input`` is a function applied to the input that
    determines a threshold on the center voxel, used to filter out certain
    patches. If None (default), no filter is applied.

    Option ``seed`` determines the seed that will be used to shuffle the patches.

    **Required metadata:**

    * ``'targets'``: The set of possible values for the target.

    """

    def __init__(self, data=None, metadata={}, call_setup=True,
                 n_examples_per_labeling=100, proportion_per_label=None,
                 patch_sizes=[4, 4, 4], filter_based_on_input=None, seed=25):
        MLProblem.__init__(self, data, metadata)
        self.n_examples_per_labeling = n_examples_per_labeling
        self.proportion_per_label = proportion_per_label
        self.patch_sizes = patch_sizes
        self.seed = seed
        self.filter_based_on_input = filter_based_on_input
        self.rng = np.random.mtrand.RandomState(self.seed)

        # Verify that the proportion_per_label values sum to 1
        if self.proportion_per_label is not None:
            total = 0
            for x in self.proportion_per_label:
                total = total + x
            if total != 1:
                raise ValueError('The values of proportion_per_label must sum to 1.')

        # Default when proportion_per_label is not set
        if self.proportion_per_label is None:
            self.proportion_per_label = [0.5, 0.5]

        if call_setup:
            ClassificationFrom3DLabelingProblem.setup(self)

    def __iter__(self):

        def get_patch(position, input, target):
            # Extract a patch of radius patch_sizes around the given position,
            # zero-padding wherever the patch extends beyond the input's boundaries.
            pos_start_x = max(position[0] - self.patch_sizes[0], 0)
            pos_start_y = max(position[1] - self.patch_sizes[1], 0)
            pos_start_z = max(position[2] - self.patch_sizes[2], 0)
            pos_end_x = position[0] + self.patch_sizes[0]
            pos_end_y = position[1] + self.patch_sizes[1]
            pos_end_z = position[2] + self.patch_sizes[2]

            dimension_x = (self.patch_sizes[0] * 2) + 1
            dimension_y = (self.patch_sizes[1] * 2) + 1
            dimension_z = (self.patch_sizes[2] * 2) + 1

            ret_patch = np.zeros((dimension_x, dimension_y, dimension_z), dtype=float)

            input_patch = input[pos_start_x:pos_end_x + 1,
                                pos_start_y:pos_end_y + 1,
                                pos_start_z:pos_end_z + 1]

            offset_x = max(-(position[0] - self.patch_sizes[0]), 0)
            offset_y = max(-(position[1] - self.patch_sizes[1]), 0)
            offset_z = max(-(position[2] - self.patch_sizes[2]), 0)

            end_x = offset_x + input_patch.shape[0]
            end_y = offset_y + input_patch.shape[1]
            end_z = offset_z + input_patch.shape[2]

            ret_patch[offset_x:end_x, offset_y:end_y, offset_z:end_z] = input_patch

            patch = (ret_patch.flatten(), target[position[0], position[1], position[2]])
            return patch

        import itertools
        for data, position in itertools.izip(self.data, self.position_lst):
            for list_item in position:
                for tuple_item in list_item:
                    yield get_patch(tuple_item, data[0], data[1])

    def __len__(self):
        return self.n_examples_per_labeling * len(self.data)

    def setup(self):
        if self.proportion_per_label is None:
            self.proportion_per_label = {}
            for t in self.metadata['targets']:
                self.proportion_per_label[t] = 1. / len(self.metadata['targets'])

        # Generate positions for each pair (input, labeling), in random order
        position_lst = []
        for input, target in self.data:
            position_lst_brain = []
            n_treated_labels = 0
            tot_examples = 0
            for t in self.metadata['targets']:
                if n_treated_labels != len(self.metadata['targets']) - 1:
                    n_examples_for_this_label = int(self.n_examples_per_labeling * self.proportion_per_label[t])
                else:
                    # Last label gets whatever remains, so counts add up to n_examples_per_labeling
                    n_examples_for_this_label = self.n_examples_per_labeling - tot_examples
                tot_examples += n_examples_for_this_label

                # Get all positions of the current label
                if self.filter_based_on_input is not None:
                    idx = np.nonzero((target == t) * self.filter_based_on_input(input))
                else:
                    idx = np.nonzero(target == t)

                # Create a list of indices into idx and shuffle it
                idx_position_lst = range(len(idx[0]))
                self.rng.shuffle(idx_position_lst)

                # Append positions (x, y, z) for the randomly chosen voxels
                position_lst_brain.append([(idx[0][id], idx[1][id], idx[2][id])
                                           for id in idx_position_lst[:n_examples_for_this_label]])
                n_treated_labels += 1

            # Re-shuffle
            self.rng.shuffle(position_lst_brain)
            position_lst += [position_lst_brain]

        self.position_lst = position_lst

    def apply_on(self, new_data, new_metadata={}):
        if self.__source_mlproblem__ is not None:
            new_data = self.__source_mlproblem__.apply_on(new_data, new_metadata)
            new_metadata = {}  # new_data should already contain the new_metadata, since it is an MLProblem

        new_problem = ClassificationFrom3DLabelingProblem(new_data, new_metadata,
                                                          n_examples_per_labeling=self.n_examples_per_labeling,
                                                          proportion_per_label=self.proportion_per_label,
                                                          filter_based_on_input=self.filter_based_on_input,
                                                          patch_sizes=self.patch_sizes,
                                                          seed=self.seed)
        return new_problem
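
# Example usage of ClassificationFrom3DLabelingProblem (a minimal sketch; the
# random volume, labeling and metadata below are illustrative only and not
# part of the library):
#
#     rng = np.random.mtrand.RandomState(1234)
#     volume = rng.rand(32, 32, 32)                  # 3D input
#     labeling = (volume > 0.5).astype(int)          # 3D labels (0 or 1)
#     data = [(volume, labeling)]
#     metadata = {'targets': [0, 1]}
#     problem = ClassificationFrom3DLabelingProblem(data, metadata,
#                                                   n_examples_per_labeling=10,
#                                                   patch_sizes=[2, 2, 2])
#     for patch, label in problem:
#         # patch is a flattened (2*2+1)**3 = 125-dimensional vector,
#         # label is the label of the patch's center voxel
#         print patch.shape, label
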
class ClassSubsetProblem(MLProblem):
    """
    Extracts examples in a dataset belonging to some subset of classes.

    Option ``subset`` gives the set of class symbols that should be included.
    The metadata ``'class_to_id'`` that maps symbols to IDs is required (it
    is assumed that the targets have already been processed by this mapping,
    see ClassificationProblem).

    Option ``include_class`` determines whether to put the class ID in the
    example or only yield the input.

    **Required metadata:**

    * ``'class_to_id'``

    **Defined metadata:**

    * ``'class_to_id'``
    * ``'targets'``

    """

    def __init__(self, data=None, metadata={}, call_setup=True,
                 subset=[],          # Subset of classes to include
                 include_class=True  # Whether to include the class field
                 ):
        MLProblem.__init__(self, data, metadata)
        self.subset = subset
        self.include_class = include_class

        self.__length__ = None
        if 'class_subset_length' in self.metadata:
            # Gives a chance to set the length through metadata
            self.__length__ = self.metadata['class_subset_length']
            del self.metadata['class_subset_length']  # So that it isn't passed to subsequent MLProblems
        else:
            # Since len(data) won't give the right answer, figure out what
            # the length is by an exhaustive count
            parent_ids = set([])
            parent_class_to_id = self.metadata['class_to_id']
            for c in self.subset:
                parent_ids.add(parent_class_to_id[c])
            self.__length__ = 0
            for input, target in self.data:
                if target in parent_ids:
                    self.__length__ += 1

        if call_setup:
            ClassSubsetProblem.setup(self)

    def __iter__(self):
        for input, target in self.data:
            if target in self.parent_id_to_id:
                if self.include_class:
                    yield input, self.parent_id_to_id[target]
                else:
                    yield input

    def setup(self):
        self.class_to_id = {}
        self.parent_id_to_id = {}
        self.targets = set([])
        parent_class_to_id = self.metadata['class_to_id']
        id = 0
        for c in self.subset:
            self.class_to_id[c] = id
            self.parent_id_to_id[parent_class_to_id[c]] = id
            self.targets.add(c)
            id += 1
        self.metadata['targets'] = self.targets
        self.metadata['class_to_id'] = self.class_to_id

    def apply_on(self, new_data, new_metadata={}):
        if self.__source_mlproblem__ is not None:
            new_data = self.__source_mlproblem__.apply_on(new_data, new_metadata)
            new_metadata = {}  # new_data should already contain the new_metadata, since it is an MLProblem

        new_problem = ClassSubsetProblem(new_data, new_metadata, call_setup=False,
                                         subset=self.subset,
                                         include_class=self.include_class)
        new_problem.targets = self.targets
        new_problem.class_to_id = self.class_to_id
        new_problem.parent_id_to_id = self.parent_id_to_id
        new_problem.metadata['targets'] = self.targets
        new_problem.metadata['class_to_id'] = self.class_to_id
        return new_problem
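
# Example usage of ClassSubsetProblem (a minimal sketch; the toy problem below
# is illustrative only and not part of the library):
#
#     trainset = [([0.2], 'cat'), ([0.9], 'dog'), ([0.4], 'bird')]
#     metadata = {'targets': set(['cat', 'dog', 'bird'])}
#     full_problem = ClassificationProblem(trainset, metadata)
#     # Keep only the 'cat' and 'dog' examples, with their class ids remapped
#     # to a new, contiguous range starting at 0
#     subset_problem = ClassSubsetProblem(full_problem, full_problem.metadata,
#                                         subset=['cat', 'dog'])
#     for input, class_id in subset_problem:
#         print input, class_id
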