# Source code for CorrectOCR.heuristics

from __future__ import annotations

import logging
from collections import Counter, OrderedDict, defaultdict
from dataclasses import dataclass, replace, field
from typing import Callable, DefaultDict, Dict, List, TYPE_CHECKING

import progressbar

from . import punctuationRE
if TYPE_CHECKING:
	from .dictionary import Dictionary
	from .tokens import Token


class Heuristics(object):
	"""
	Sorts tokens into the heuristic bins of this module and collects the statistics for the heuristics report.

	Bin settings, counts and examples are stored on the module-level ``_bins`` table,
	so they are shared by every instance of this class.
	"""
	log = logging.getLogger(f'{__name__}.Heuristics')

	@classmethod
	def bin(cls, n: int) -> 'Bin':
		"""
		Look up a bin by its number.

		:param n: Number of the bin.
		:return: A copy of the bin registered under ``n``.
		:raises KeyError: If there is no bin numbered ``n``.
		"""
		return _bins[n]._copy()

	def __init__(self, settings: Dict[int, str], dictionary):
		"""
		:param settings: A dictionary of ``bin`` => ``heuristic`` settings.
		:param dictionary: A dictionary for determining correctness of :class:`Tokens<CorrectOCR.tokens.Token>` and suggestions.
		:raises KeyError: If ``settings`` names a bin number that does not exist.
		"""
		# NB: this updates the shared module-level bins, not a per-instance copy.
		for (_bin, code) in settings.items():
			_bins[int(_bin)].heuristic = code
		for (number, _bin) in _bins.items():
			_bin.number = number
		Heuristics.log.debug(f'Bins: {_bins}')
		self.dictionary = dictionary
		self.tokenCount = 0
		self.totalCount = 0
		self.punctuationCount = 0
		self.nogoldCount = 0
		self.oversegmented = 0
		self.undersegmented = 0

	def bin_for_token(self, token: 'Token'):
		"""
		Find the bin a token falls into and apply that bin's heuristic.

		:param token: The token to examine. Its ``kbest``, ``k``, ``normalized`` and ``original`` attributes are read.
		:return: A tuple ``(decision, selection, bin)`` where ``decision`` is one of
			``'original'``, ``'kbest'``, ``'kdict'`` or ``'annotator'``, ``selection`` is respectively
			the original string, ``1``, the key of the first *k*-best candidate found in the dictionary,
			or the list of keys of all *k*-best candidates found in the dictionary, and ``bin`` is a copy
			of the matching bin (``None`` if no bin matched).
		"""
		# keys of the k-best candidates which are in dictionary
		filtids = [n for n, item in token.kbest.items() if item.candidate in self.dictionary]

		dcode = None
		if not filtids:
			dcode = 'zerokd'
		elif len(filtids) < token.k:
			dcode = 'somekd'
		elif len(filtids) == token.k:
			dcode = 'allkd'

		# bins are tried in order; the first one whose matcher accepts the token wins
		token_bin = None
		for _bin in _bins.values():
			if _bin.matcher(token.normalized, token.kbest[1].candidate, self.dictionary, dcode):
				token_bin = _bin._copy()
				break

		# without a matching bin there is no heuristic to apply, so defer to the annotator
		heuristic = token_bin.heuristic if token_bin is not None else 'a'

		# return decision and chosen candidate(s)
		if heuristic == 'o':
			(decision, selection) = ('original', token.original)
		elif heuristic == 'k':
			(decision, selection) = ('kbest', 1)
		elif heuristic == 'd' and filtids:
			(decision, selection) = ('kdict', filtids[0])
		else:
			# heuristic is 'a' or unrecognized, or 'd' was requested
			# but no candidate is in the dictionary (nothing to select)
			(decision, selection) = ('annotator', filtids)

		return decision, selection, token_bin

	def bin_tokens(self, tokens: List['Token'], force = False):
		"""
		Assign ``decision``, ``selection`` and ``bin`` to each token, and log how many tokens need an annotator.

		:param tokens: The tokens to process; they are modified in place.
		:param force: If true, tokens that already have a bin are binned again.
		"""
		Heuristics.log.info('Running heuristics on tokens to determine annotator workload.')
		counts = Counter()
		annotatorRequired = 0
		for t in progressbar.progressbar(tokens):
			if force or not t.bin:
				t.decision, t.selection, t.bin = self.bin_for_token(t)
			if t.bin is not None:
				counts[t.bin.number] += 1
			if t.decision == 'annotator':
				annotatorRequired += 1
		Heuristics.log.debug(f'Counts for each bin: {counts}')
		Heuristics.log.info(f'Annotator required for {annotatorRequired} of {len(tokens)} tokens.')

	def add_to_report(self, token: 'Token'):
		"""
		Update the report statistics with a single token.

		:param token: A token; its ``gold``, ``original`` and ``kbest`` attributes and
			``is_punctuation()`` method are used.
		"""
		self.totalCount += 1

		if token.is_punctuation():
			self.punctuationCount += 1
			return

		# if the token or gold column is empty, a word segmentation error probably occurred in the original
		# (though possibly a deletion)
		# don't count any other errors here; they will be counted in the segmentation error's other half.
		if token.original == '' and len(token.gold) > 0:
			self.undersegmented += 1 # words ran together in original / undersegmentation
			return

		if token.gold == '' and len(token.original) > 0:
			self.oversegmented += 1 # word wrongly broken apart in original / oversegmentation
			return

		if len(token.gold) == 0:
			self.nogoldCount += 1

		# strip punctuation, which is considered not relevant to evaluation
		gold = punctuationRE.sub('', token.gold) # gold standard wordform
		original = punctuationRE.sub('', token.original) # original uncorrected wordform

		# total number of real tokens - controlled for segmentation errors
		self.tokenCount += 1

		(_, _, _bin) = self.bin_for_token(token)

		if not _bin:
			return # was unable to make heuristic decision

		# statistics are kept on the shared bin, not on the copy returned above
		shared_bin = _bins[_bin.number]

		# keep the first token with a reasonably long original as the example for the bin
		if shared_bin.example is None and len(original) > 3:
			shared_bin.example = token

		# the digit prefixes of the keys fix the order of the lines in the report
		counts = shared_bin.counts
		counts['total'] += 1

		if original == gold:
			counts['1 gold == orig'] += 1

		if token.kbest[1].candidate == gold:
			counts['2 gold == k1'] += 1

		# lower k best candidate words that pass the dictionary check
		kbest_filtered = [item.candidate for (k, item) in token.kbest.items() if k > 1 and item.candidate in self.dictionary]

		if gold in kbest_filtered:
			counts['3 gold == lower kbest'] += 1

	def report(self) -> str:
		"""
		Render the statistics gathered by :meth:`add_to_report` as text.

		:return: The report: overall counts, followed by one section per bin.
		"""
		Heuristics.log.debug(f'{[(i, b.counts) for i,b in _bins.items()]}')

		def share(part, whole):
			# an empty report must not fail with ZeroDivisionError
			return part / whole if whole else 0.0

		out = [
			f'Total tokens included in evaluation: {self.totalCount:10d}         '.rjust(60) + '\n\n',
			f'Tokens without gold correction: {self.nogoldCount:10d} ({share(self.nogoldCount, self.totalCount):6.2%})'.rjust(60) + '\n\n',
			f'Oversegmented: {self.oversegmented:10d} ({share(self.oversegmented, self.totalCount):6.2%})'.rjust(60) + '\n',
			f'Undersegmented: {self.undersegmented:10d} ({share(self.undersegmented, self.totalCount):6.2%})'.rjust(60) + '\n',
			f'Tokens that are punctuation: {self.punctuationCount:10d} ({share(self.punctuationCount, self.totalCount):6.2%})'.rjust(60) + '\n\n',
			f'Tokens available for evaluation: {self.tokenCount:10d} ({share(self.tokenCount, self.totalCount):6.2%})'.rjust(60) + '\n\n',
			'Choose from these options for each bin:\n',
			'\ta (annotator)\n',
			'\to (original)\n',
			'\tk (k1, best candidate)\n',
			'\td (best candidate in dictionary)\n',
			'(o and k interchangeable when original is identical to k1; d not applicable in all bins)\n\n\n\n',
		]

		for num, _bin in _bins.items():
			out.append(f'BIN {num} \t\t\t\t\t\t\t\t enter decision here:\t\n')
			out.append(_bin.description + '\n')
			total = _bin.counts.get('total', 0)
			if total > 0:
				for name, count in sorted(_bin.counts.items()):
					if name == 'total':
						continue
					# name[2:] drops the digit prefix that is only there for sorting
					out.append(f'{name[2:]:20}:{count:10d} ({share(count, total):6.2%})'.rjust(60) + '\n')
				out.append(f'total:{total:10d} ({share(total, self.tokenCount):6.2%})'.rjust(60) + '\n')
			else:
				out.append('\tNo tokens matched.')
			if _bin.example is not None:
				example = _bin.example
				out.append('Example:\n')
				out.append(f'\toriginal = {example.original}\n')
				out.append(f'\tgold = {example.gold}\n')
				out.append('\tkbest = [\n')
				for k, item in example.kbest.items():
					inDict = ' * is in dictionary' if item.candidate in self.dictionary else ''
					out.append(f'\t\t{k}: {item.candidate} ({item.probability:.2e}){inDict}\n')
				out.append('\t]\n')
			out.append('\n\n\n')

		return ''.join(out)


##########################################################################################


@dataclass
class Bin:
	"""
	Heuristics bin: a matcher that decides whether a token belongs to the bin,
	the heuristic to apply to tokens that do, and statistics for reporting.
	"""
	description: str
	"""Description of bin"""
	matcher: Callable[[str, str, 'Dictionary', str], bool]
	"""Function or lambda which returns `True` if a given :class:`CorrectOCR.tokens.Token` fits into the bin, or `False` otherwise.

	:param o: Original string
	:param k: *k*-best candidate string
	:param d: Dictionary
	:param dcode: One of 'zerokd', 'somekd', 'allkd' for whether zero, some, or all other *k*-best candidates are in dictionary
	"""
	heuristic: str = 'a'
	"""
	Which heuristic the bin is set up for, one of:
	
	-  'a' = Defer to annotator.
	-  'o' = Select original.
	-  'k' = Select top *k*-best.
	-  'd' = Select *k*-best in dictionary.
	"""
	number: int = None #: The number of the bin.
	counts: DefaultDict[str, int] = field(default_factory=lambda: defaultdict(int)) #: Statistics used for reporting.
	example: 'Token' = None #: An example of a matching :class:`CorrectOCR.tokens.Token`, used for reporting.

	def _copy(self):
		"""
		Return a shallow copy of the bin.

		The copy is a new ``Bin`` with the same field values, so mutable fields
		(notably ``counts``) are shared with the original rather than duplicated.
		"""
		return replace(self)


##########################################################################################


# The table of heuristic bins, keyed by bin number.
# Order matters: bins are tried in this order and the first bin whose matcher
# accepts a token is used, so the catch-all bin must stay last.
# Matcher arguments: o = original, k = top k-best candidate, d = dictionary,
# dcode = 'zerokd' / 'somekd' / 'allkd'.
_bins: Dict[int, 'Bin'] = OrderedDict({
	1: Bin(
		description='k1 == original and both are in dictionary.',
		matcher=lambda o, k, d, dcode: o == k and o in d,
	),
	2: Bin(
		description='k1 == original but they are not in dictionary, and no other kbest is in dictionary either.',
		matcher=lambda o, k, d, dcode: o == k and o not in d and dcode == 'zerokd',
	),
	3: Bin(
		description='k1 == original but they are not in dictionary, but some lower-ranked kbest is.',
		matcher=lambda o, k, d, dcode: o == k and o not in d and dcode == 'somekd',
	),
	4: Bin(
		# double quotes: 'isn''t' is implicit concatenation and would read "isnt"
		description="k1 != original and is in dictionary while original isn't.",
		matcher=lambda o, k, d, dcode: o != k and o not in d and k in d,
	),
	5: Bin(
		description='k1 != original and nothing is in dictionary.',
		matcher=lambda o, k, d, dcode: o != k and o not in d and dcode == 'zerokd',
	),
	6: Bin(
		description='k1 != original and neither are in dictionary, but a lower-ranked candidate is.',
		matcher=lambda o, k, d, dcode: o != k and k not in d and o not in d and dcode == 'somekd',
	),
	7: Bin(
		description='k1 != original and both are in dictionary.',
		matcher=lambda o, k, d, dcode: o != k and o in d and k in d,
	),
	8: Bin(
		description='k1 != original, original is in dictionary and no candidates are in dictionary.',
		matcher=lambda o, k, d, dcode: o != k and o in d and dcode == 'zerokd',
	),
	9: Bin(
		description='k1 != original, k1 is not in dictionary but both original and a lower candidate are.',
		matcher=lambda o, k, d, dcode: o != k and o in d and k not in d and dcode == 'somekd',
	),
	10: Bin(
		description='Catch-all bin, matches any remaining tokens. It is recommended to pass this to annotator.',
		matcher=lambda o, k, d, dcode: True,
	)
})