classifyTools.py 5.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171
  1. """ fontTools.misc.classifyTools.py -- tools for classifying things.
  2. """
  3. class Classifier(object):
  4. """
  5. Main Classifier object, used to classify things into similar sets.
  6. """
  7. def __init__(self, sort=True):
  8. self._things = set() # set of all things known so far
  9. self._sets = [] # list of class sets produced so far
  10. self._mapping = {} # map from things to their class set
  11. self._dirty = False
  12. self._sort = sort
  13. def add(self, set_of_things):
  14. """
  15. Add a set to the classifier. Any iterable is accepted.
  16. """
  17. if not set_of_things:
  18. return
  19. self._dirty = True
  20. things, sets, mapping = self._things, self._sets, self._mapping
  21. s = set(set_of_things)
  22. intersection = s.intersection(things) # existing things
  23. s.difference_update(intersection) # new things
  24. difference = s
  25. del s
  26. # Add new class for new things
  27. if difference:
  28. things.update(difference)
  29. sets.append(difference)
  30. for thing in difference:
  31. mapping[thing] = difference
  32. del difference
  33. while intersection:
  34. # Take one item and process the old class it belongs to
  35. old_class = mapping[next(iter(intersection))]
  36. old_class_intersection = old_class.intersection(intersection)
  37. # Update old class to remove items from new set
  38. old_class.difference_update(old_class_intersection)
  39. # Remove processed items from todo list
  40. intersection.difference_update(old_class_intersection)
  41. # Add new class for the intersection with old class
  42. sets.append(old_class_intersection)
  43. for thing in old_class_intersection:
  44. mapping[thing] = old_class_intersection
  45. del old_class_intersection
  46. def update(self, list_of_sets):
  47. """
  48. Add a a list of sets to the classifier. Any iterable of iterables is accepted.
  49. """
  50. for s in list_of_sets:
  51. self.add(s)
  52. def _process(self):
  53. if not self._dirty:
  54. return
  55. # Do any deferred processing
  56. sets = self._sets
  57. self._sets = [s for s in sets if s]
  58. if self._sort:
  59. self._sets = sorted(self._sets, key=lambda s: (-len(s), sorted(s)))
  60. self._dirty = False
  61. # Output methods
  62. def getThings(self):
  63. """Returns the set of all things known so far.
  64. The return value belongs to the Classifier object and should NOT
  65. be modified while the classifier is still in use.
  66. """
  67. self._process()
  68. return self._things
  69. def getMapping(self):
  70. """Returns the mapping from things to their class set.
  71. The return value belongs to the Classifier object and should NOT
  72. be modified while the classifier is still in use.
  73. """
  74. self._process()
  75. return self._mapping
  76. def getClasses(self):
  77. """Returns the list of class sets.
  78. The return value belongs to the Classifier object and should NOT
  79. be modified while the classifier is still in use.
  80. """
  81. self._process()
  82. return self._sets
  83. def classify(list_of_sets, sort=True):
  84. """
  85. Takes a iterable of iterables (list of sets from here on; but any
  86. iterable works.), and returns the smallest list of sets such that
  87. each set, is either a subset, or is disjoint from, each of the input
  88. sets.
  89. In other words, this function classifies all the things present in
  90. any of the input sets, into similar classes, based on which sets
  91. things are a member of.
  92. If sort=True, return class sets are sorted by decreasing size and
  93. their natural sort order within each class size. Otherwise, class
  94. sets are returned in the order that they were identified, which is
  95. generally not significant.
  96. >>> classify([]) == ([], {})
  97. True
  98. >>> classify([[]]) == ([], {})
  99. True
  100. >>> classify([[], []]) == ([], {})
  101. True
  102. >>> classify([[1]]) == ([{1}], {1: {1}})
  103. True
  104. >>> classify([[1,2]]) == ([{1, 2}], {1: {1, 2}, 2: {1, 2}})
  105. True
  106. >>> classify([[1],[2]]) == ([{1}, {2}], {1: {1}, 2: {2}})
  107. True
  108. >>> classify([[1,2],[2]]) == ([{1}, {2}], {1: {1}, 2: {2}})
  109. True
  110. >>> classify([[1,2],[2,4]]) == ([{1}, {2}, {4}], {1: {1}, 2: {2}, 4: {4}})
  111. True
  112. >>> classify([[1,2],[2,4,5]]) == (
  113. ... [{4, 5}, {1}, {2}], {1: {1}, 2: {2}, 4: {4, 5}, 5: {4, 5}})
  114. True
  115. >>> classify([[1,2],[2,4,5]], sort=False) == (
  116. ... [{1}, {4, 5}, {2}], {1: {1}, 2: {2}, 4: {4, 5}, 5: {4, 5}})
  117. True
  118. >>> classify([[1,2,9],[2,4,5]], sort=False) == (
  119. ... [{1, 9}, {4, 5}, {2}], {1: {1, 9}, 2: {2}, 4: {4, 5}, 5: {4, 5},
  120. ... 9: {1, 9}})
  121. True
  122. >>> classify([[1,2,9,15],[2,4,5]], sort=False) == (
  123. ... [{1, 9, 15}, {4, 5}, {2}], {1: {1, 9, 15}, 2: {2}, 4: {4, 5},
  124. ... 5: {4, 5}, 9: {1, 9, 15}, 15: {1, 9, 15}})
  125. True
  126. >>> classes, mapping = classify([[1,2,9,15],[2,4,5],[15,5]], sort=False)
  127. >>> set([frozenset(c) for c in classes]) == set(
  128. ... [frozenset(s) for s in ({1, 9}, {4}, {2}, {5}, {15})])
  129. True
  130. >>> mapping == {1: {1, 9}, 2: {2}, 4: {4}, 5: {5}, 9: {1, 9}, 15: {15}}
  131. True
  132. """
  133. classifier = Classifier(sort=sort)
  134. classifier.update(list_of_sets)
  135. return classifier.getClasses(), classifier.getMapping()
  136. if __name__ == "__main__":
  137. import sys, doctest
  138. sys.exit(doctest.testmod(optionflags=doctest.ELLIPSIS).failed)