Source code for CorrectOCR.heuristics

from __future__ import annotations

import logging
from collections import Counter, OrderedDict, defaultdict
from dataclasses import dataclass, replace, field
from typing import Callable, DefaultDict, Dict, List, TYPE_CHECKING

import progressbar

from . import punctuationRE
	from .dictionary import Dictionary
	from .tokens import Token

class Heuristics(object):
    """
    Sorts tokens into heuristic bins and collects statistics for a report.

    Each bin in the module-level ``_bins`` table has a ``matcher`` and a
    ``heuristic`` code. The first bin whose matcher accepts a token decides
    whether the original, the top *k*-best candidate, the first *k*-best
    candidate found in the dictionary, or a human annotator is used.
    """

    log = logging.getLogger(f'{__name__}.Heuristics')

    @classmethod
    def bin(cls, n: int) -> 'Bin':
        """
        Return a copy of bin number ``n``.

        :param n: Key of the bin in the module-level ``_bins`` table.
        :return: The result of ``_copy()`` on that bin.
        :raises KeyError: If there is no bin with that number.
        """
        return _bins[n]._copy()

    def __init__(self, settings: Dict[int, str], dictionary):
        """
        :param settings: A dictionary of ``bin`` => ``heuristic`` settings.
        :param dictionary: A dictionary for determining correctness of :class:`Tokens<CorrectOCR.tokens.Token>` and suggestions.
        :raises KeyError: If ``settings`` names a bin number that is not in ``_bins``.
        """
        # NOTE: this writes into the module-level bin table, so the settings
        # are visible to every Heuristics instance, not just this one.
        for (_bin, code) in settings.items():
            _bins[int(_bin)].heuristic = code
        for (number, _bin) in _bins.items():
            _bin.number = number
        Heuristics.log.debug(f'Bins: {_bins}')
        self.dictionary = dictionary
        # Counters filled in by add_to_report() and printed by report().
        self.tokenCount = 0
        self.totalCount = 0
        self.punctuationCount = 0
        self.nogoldCount = 0
        self.oversegmented = 0
        self.undersegmented = 0

    @staticmethod
    def _share(part, whole) -> float:
        """Return ``part / whole``, or ``0.0`` when ``whole`` is zero."""
        return part / whole if whole else 0.0

    def bin_for_token(self, token: 'Token'):
        """
        Find the bin matching ``token`` and apply that bin's heuristic.

        :param token: The token to classify. Its ``kbest``, ``k``,
            ``normalized`` and ``original`` attributes are read.
        :return: A tuple ``(decision, selection, bin)``. ``decision`` is one of
            ``'original'``, ``'kbest'``, ``'kdict'`` or ``'annotator'``;
            ``selection`` is respectively the original string, ``1``, the key
            of the first *k*-best candidate in the dictionary, or the list of
            all such keys. ``bin`` is a copy of the matching bin, or ``None``
            if no bin matched.
        """
        # k best candidates which are in dictionary
        filtids = [n for n, item in token.kbest.items() if item.candidate in self.dictionary]

        if not filtids:
            dcode = 'zerokd'
        elif len(filtids) < token.k:
            dcode = 'somekd'
        elif len(filtids) == token.k:
            dcode = 'allkd'
        else:
            dcode = None

        token_bin = None
        for _bin in _bins.values():
            if _bin.matcher(token.normalized, token.kbest[1].candidate, self.dictionary, dcode):
                token_bin = _bin._copy()
                break

        if token_bin is None:
            # No matcher accepted the token: leave the decision to the annotator
            # instead of failing on a missing bin.
            return 'annotator', filtids, None

        # return decision and chosen candidate(s)
        heuristic = token_bin.heuristic
        if heuristic == 'o':
            return 'original', token.original, token_bin
        if heuristic == 'k':
            return 'kbest', 1, token_bin
        if heuristic == 'd' and filtids:
            return 'kdict', filtids[0], token_bin
        # heuristic is 'a' or unrecognized, or 'd' was requested although
        # no candidate is in the dictionary
        return 'annotator', filtids, token_bin

    def bin_tokens(self, tokens: List['Token'], force = False):
        """
        Assign ``decision``, ``selection`` and ``bin`` to each token.

        :param tokens: The tokens to process; they are modified in place.
        :param force: If true, tokens that already have a bin are binned again.
        """
        Heuristics.log.info('Running heuristics on tokens to determine annotator workload.')
        counts = Counter()
        annotatorRequired = 0
        for t in progressbar.progressbar(tokens):
            if force or not t.bin:
                t.decision, t.selection, t.bin = self.bin_for_token(t)
            # Tokens that matched no bin are tallied under the key None.
            counts[t.bin.number if t.bin is not None else None] += 1
            if t.decision == 'annotator':
                annotatorRequired += 1
        Heuristics.log.debug(f'Counts for each bin: {counts}')
        Heuristics.log.info(f'Annotator required for {annotatorRequired} of {len(tokens)} tokens.')

    def add_to_report(self, token: 'Token'):
        """
        Add one token with a gold correction to the report statistics.

        Updates the counters on this instance and the ``counts``/``example``
        of the matching bin in the module-level ``_bins`` table.

        :param token: The token to count. Its ``gold``, ``original`` and
            ``kbest`` attributes and ``is_punctuation()`` are read.
        """
        self.totalCount += 1

        if token.is_punctuation():
            self.punctuationCount += 1
            return

        # if the token or gold column is empty, a word segmentation error probably occurred in the original
        # (though possibly a deletion)
        # don't count any other errors here; they will be counted in the segmentation error's other half.
        if token.original == '' and len(token.gold) > 0:
            self.undersegmented += 1  # words ran together in original / undersegmentation
            return

        if token.gold == '' and len(token.original) > 0:
            self.oversegmented += 1  # word wrongly broken apart in original / oversegmentation
            return

        # Counted, but the token is still evaluated below.
        if len(token.gold) == 0:
            self.nogoldCount += 1

        # strip punctuation, which is considered not relevant to evaluation
        gold = punctuationRE.sub('', token.gold)  # gold standard wordform
        original = punctuationRE.sub('', token.original)  # original uncorrected wordform

        # total number of real tokens - controlled for segmentation errors
        self.tokenCount += 1

        # an evidently useful quantity for sorting out what to send to annotators
        #  - can split any existing category across a threshold of this quantity
        #    (based on probabilities of best and 2nd-best candidates)
        #    qqh = (token.kbest[1].probability-token.kbest[2].probability) / token.kbest[1].probability

        (_, _, _bin) = self.bin_for_token(token)

        if not _bin:
            return  # was unable to make heuristic decision

        # Statistics are kept on the shared bin, not on the copy returned above.
        shared_bin = _bins[_bin.number]

        # Remember the first reasonably long token as the example for the bin.
        if shared_bin.example is None and len(original) > 3:
            shared_bin.example = token

        # The numeric prefixes fix the order of the lines in report().
        counts = shared_bin.counts
        counts['total'] += 1
        if original == gold:
            counts['1 gold == orig'] += 1
        if token.kbest[1].candidate == gold:
            counts['2 gold == k1'] += 1

        # lower k best candidate words that pass the dictionary check
        kbest_filtered = [
            item.candidate
            for (k, item) in token.kbest.items()
            if item.candidate in self.dictionary and k > 1
        ]

        if gold in kbest_filtered:
            counts['3 gold == lower kbest'] += 1

    def report(self) -> str:
        """
        Render the collected statistics as a plain-text report.

        Percentages are shown as zero when their denominator is zero.

        :return: The report: overall counts first, then one section per bin
            with its description, counts and an example token if one was seen.
        """
        Heuristics.log.debug(f'{[(i, b.counts) for i,b in _bins.items()]}')

        share = Heuristics._share
        everything = self.totalCount

        out = ''
        out += f'Total tokens included in evaluation: {self.totalCount:10d} '.rjust(60) + '\n\n'
        out += f'Tokens without gold correction: {self.nogoldCount:10d} ({share(self.nogoldCount, everything):6.2%})'.rjust(60) + '\n\n'
        out += f'Oversegmented: {self.oversegmented:10d} ({share(self.oversegmented, everything):6.2%})'.rjust(60) + '\n'
        out += f'Undersegmented: {self.undersegmented:10d} ({share(self.undersegmented, everything):6.2%})'.rjust(60) + '\n'
        out += f'Tokens that are punctuation: {self.punctuationCount:10d} ({share(self.punctuationCount, everything):6.2%})'.rjust(60) + '\n\n'
        out += f'Tokens available for evaluation: {self.tokenCount:10d} ({share(self.tokenCount, everything):6.2%})'.rjust(60) + '\n\n'
        out += 'Choose from these options for each bin:\n'
        out += '\ta (annotator)\n'
        out += '\to (original)\n'
        out += '\tk (k1, best candidate)\n'
        out += '\td (best candidate in dictionary)\n'
        out += '(o and k interchangeable when original is identical to k1; d not applicable in all bins)\n\n\n\n'

        for num, _bin in _bins.items():
            out += f'BIN {num} \t\t\t\t\t\t\t\t enter decision here:\t\n'
            out += _bin.description + '\n'
            if _bin.counts:
                # Read the total without removing it from the shared counts.
                total = _bin.counts.get('total', 0)
                details = sorted((name, count) for name, count in _bin.counts.items() if name != 'total')
                for name, count in details:
                    # name[2:] drops the ordering prefix, e.g. '1 '.
                    out += f'{name[2:]:20}:{count:10d} ({share(count, total):6.2%})'.rjust(60) + '\n'
                out += f'total:{total:10d} ({share(total, self.tokenCount):6.2%})'.rjust(60) + '\n'
            else:
                out += '\tNo tokens matched.'
            if _bin.example is not None:
                example = _bin.example
                out += 'Example:\n'
                out += f'\toriginal = {example.original}\n'
                out += f'\tgold = {example.gold}\n'
                out += '\tkbest = [\n'
                for k, item in example.kbest.items():
                    inDict = ' * is in dictionary' if item.candidate in self.dictionary else ''
                    out += f'\t\t{k}: {item.candidate} ({item.probability:.2e}){inDict}\n'
                out += '\t]\n'
            out += '\n\n\n'

        return out
@dataclass
class Bin:
    """
    Heuristics bin ...

    TODO TABLE
    """

    #: Description of bin
    description: str

    #: Function or lambda which returns `True` if a given
    #: :class:`CorrectOCR.tokens.Token` fits into the bin, or `False` otherwise.
    #:
    #: :param o: Original string
    #: :param k: *k*-best candidate string
    #: :param d: Dictionary
    #: :param dcode: One of 'zerokd', 'somekd', 'allkd' for whether zero, some,
    #:    or all other *k*-best candidates are in dictionary
    matcher: Callable[[str, str, 'Dictionary', str], bool]

    #: Which heuristic the bin is set up for, one of:
    #:
    #: - 'a' = Defer to annotator.
    #: - 'o' = Select original.
    #: - 'k' = Select top *k*-best.
    #: - 'd' = Select *k*-best in dictionary.
    heuristic: str = 'a'

    #: The number of the bin.
    number: int = None

    #: Statistics used for reporting.
    counts: DefaultDict[str, int] = field(default_factory=lambda: defaultdict(int))

    #: An example of a matching :class:`CorrectOCR.tokens.Token`, used for reporting.
    example: 'Token' = None

    def _copy(self):
        """Return a new ``Bin`` with the same field values as this one.

        The copy is shallow: ``counts`` refers to the same mapping object
        as the original bin's ``counts``.
        """
        return replace(self)
##########################################################################################

# Table of heuristic bins, keyed by bin number.
#
# Every matcher is called as ``matcher(o, k, d, dcode)``:
#   o     -- the original string
#   k     -- the top k-best candidate string (k1)
#   d     -- the dictionary (tested with ``in``)
#   dcode -- 'zerokd', 'somekd' or 'allkd': whether zero, some or all of the
#            k-best candidates are in the dictionary
#
# The order of the entries matters: the last bin matches everything, so it
# has to stay at the end to act as the catch-all.
_bins: Dict[int, Bin] = OrderedDict({
    1: Bin(
        description='k1 == original and both are in dictionary.',
        matcher=lambda o, k, d, dcode: o == k and o in d,
    ),
    2: Bin(
        description='k1 == original but they are not in dictionary, and no other kbest is in dictionary either.',
        matcher=lambda o, k, d, dcode: o == k and o not in d and dcode == 'zerokd',
    ),
    3: Bin(
        description='k1 == original but they are not in dictionary, but some lower-ranked kbest is.',
        matcher=lambda o, k, d, dcode: o == k and o not in d and dcode == 'somekd',
    ),
    4: Bin(
        # Was written 'isn''t', which Python reads as two adjacent literals
        # ('isn' + 't.') and therefore drops the apostrophe.
        description="k1 != original and is in dictionary while original isn't.",
        matcher=lambda o, k, d, dcode: o != k and o not in d and k in d,
    ),
    5: Bin(
        description='k1 != original and nothing is in dictionary.',
        matcher=lambda o, k, d, dcode: o != k and o not in d and dcode == 'zerokd',
    ),
    6: Bin(
        description='k1 != original and neither are in dictionary, but a lower-ranked candidate is.',
        matcher=lambda o, k, d, dcode: o != k and k not in d and o not in d and dcode == 'somekd',
    ),
    7: Bin(
        description='k1 != original and both are in dictionary.',
        matcher=lambda o, k, d, dcode: o != k and o in d and k in d,
    ),
    8: Bin(
        description='k1 != original, original is in dictionary and no candidates are in dictionary.',
        matcher=lambda o, k, d, dcode: o != k and o in d and dcode == 'zerokd',
    ),
    9: Bin(
        description='k1 != original, k1 is not in dictionary but both original and a lower candidate are.',
        matcher=lambda o, k, d, dcode: o != k and o in d and k not in d and dcode == 'somekd',
    ),
    10: Bin(
        description='Catch-all bin, matches any remaining tokens. It is recommended to pass this to annotator.',
        matcher=lambda o, k, d, dcode: True,
    ),
})