""" fontTools.misc.classifyTools.py -- tools for classifying things. """ class Classifier(object): """ Main Classifier object, used to classify things into similar sets. """ def __init__(self, sort=True): self._things = set() # set of all things known so far self._sets = [] # list of class sets produced so far self._mapping = {} # map from things to their class set self._dirty = False self._sort = sort def add(self, set_of_things): """ Add a set to the classifier. Any iterable is accepted. """ if not set_of_things: return self._dirty = True things, sets, mapping = self._things, self._sets, self._mapping s = set(set_of_things) intersection = s.intersection(things) # existing things s.difference_update(intersection) # new things difference = s del s # Add new class for new things if difference: things.update(difference) sets.append(difference) for thing in difference: mapping[thing] = difference del difference while intersection: # Take one item and process the old class it belongs to old_class = mapping[next(iter(intersection))] old_class_intersection = old_class.intersection(intersection) # Update old class to remove items from new set old_class.difference_update(old_class_intersection) # Remove processed items from todo list intersection.difference_update(old_class_intersection) # Add new class for the intersection with old class sets.append(old_class_intersection) for thing in old_class_intersection: mapping[thing] = old_class_intersection del old_class_intersection def update(self, list_of_sets): """ Add a a list of sets to the classifier. Any iterable of iterables is accepted. """ for s in list_of_sets: self.add(s) def _process(self): if not self._dirty: return # Do any deferred processing sets = self._sets self._sets = [s for s in sets if s] if self._sort: self._sets = sorted(self._sets, key=lambda s: (-len(s), sorted(s))) self._dirty = False # Output methods def getThings(self): """Returns the set of all things known so far. The return value belongs to the Classifier object and should NOT be modified while the classifier is still in use. """ self._process() return self._things def getMapping(self): """Returns the mapping from things to their class set. The return value belongs to the Classifier object and should NOT be modified while the classifier is still in use. """ self._process() return self._mapping def getClasses(self): """Returns the list of class sets. The return value belongs to the Classifier object and should NOT be modified while the classifier is still in use. """ self._process() return self._sets def classify(list_of_sets, sort=True): """ Takes a iterable of iterables (list of sets from here on; but any iterable works.), and returns the smallest list of sets such that each set, is either a subset, or is disjoint from, each of the input sets. In other words, this function classifies all the things present in any of the input sets, into similar classes, based on which sets things are a member of. If sort=True, return class sets are sorted by decreasing size and their natural sort order within each class size. Otherwise, class sets are returned in the order that they were identified, which is generally not significant. >>> classify([]) == ([], {}) True >>> classify([[]]) == ([], {}) True >>> classify([[], []]) == ([], {}) True >>> classify([[1]]) == ([{1}], {1: {1}}) True >>> classify([[1,2]]) == ([{1, 2}], {1: {1, 2}, 2: {1, 2}}) True >>> classify([[1],[2]]) == ([{1}, {2}], {1: {1}, 2: {2}}) True >>> classify([[1,2],[2]]) == ([{1}, {2}], {1: {1}, 2: {2}}) True >>> classify([[1,2],[2,4]]) == ([{1}, {2}, {4}], {1: {1}, 2: {2}, 4: {4}}) True >>> classify([[1,2],[2,4,5]]) == ( ... [{4, 5}, {1}, {2}], {1: {1}, 2: {2}, 4: {4, 5}, 5: {4, 5}}) True >>> classify([[1,2],[2,4,5]], sort=False) == ( ... [{1}, {4, 5}, {2}], {1: {1}, 2: {2}, 4: {4, 5}, 5: {4, 5}}) True >>> classify([[1,2,9],[2,4,5]], sort=False) == ( ... [{1, 9}, {4, 5}, {2}], {1: {1, 9}, 2: {2}, 4: {4, 5}, 5: {4, 5}, ... 9: {1, 9}}) True >>> classify([[1,2,9,15],[2,4,5]], sort=False) == ( ... [{1, 9, 15}, {4, 5}, {2}], {1: {1, 9, 15}, 2: {2}, 4: {4, 5}, ... 5: {4, 5}, 9: {1, 9, 15}, 15: {1, 9, 15}}) True >>> classes, mapping = classify([[1,2,9,15],[2,4,5],[15,5]], sort=False) >>> set([frozenset(c) for c in classes]) == set( ... [frozenset(s) for s in ({1, 9}, {4}, {2}, {5}, {15})]) True >>> mapping == {1: {1, 9}, 2: {2}, 4: {4}, 5: {5}, 9: {1, 9}, 15: {15}} True """ classifier = Classifier(sort=sort) classifier.update(list_of_sets) return classifier.getClasses(), classifier.getMapping() if __name__ == "__main__": import sys, doctest sys.exit(doctest.testmod(optionflags=doctest.ELLIPSIS).failed)