# Source code for CorrectOCR.correcter

import cmd
import logging
from collections import deque
from typing import List, Iterator, TypeVar, Tuple

from .tokens import TokenList

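'''
IMPORTANT BEFORE USING:
To display the interactive text correctly, your terminal environment must use
an encoding that is compatible with the text being corrected (e.g. UTF-8).
For example:
> export LANG=is_IS.UTF-8
> export LC_ALL=is_IS.UTF-8
> locale
> export PYTHONIOENCODING=utf8
(`locale` prints the current locale settings.)
'''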


T = TypeVar('T')
def _split_window(l: List[T], before=3, after=3) -> Iterator[Tuple[List[T], T, List[T]]]:
	"""
	Walk through ``l`` and yield every element together with its neighbours.

	:param l: The sequence to iterate over.
	:param before: Maximum number of preceding elements to include.
	:param after: Maximum number of following elements to include.
	:return: An iterator of ``(preceding, element, following)`` triples. Near
		the ends of ``l`` the context lists are shorter than requested.
	"""
	# A bounded deque silently drops the oldest element once it is full
	preceding = deque(maxlen=before)
	for position, element in enumerate(l):
		following_start = position + 1
		following = l[following_start:following_start + after]
		yield list(preceding), element, following
		preceding.append(element)


class CorrectionShell(cmd.Cmd):
	"""
	Interactive shell for making corrections to a list of tokens. Assumes that the
	tokens are `binned`.

	Tokens that already have a gold value are skipped. For every other token the
	shell either asks the annotator (heuristic ``annotator``) or queues the
	command named by the token's heuristic so that it is executed automatically.
	"""
	log = logging.getLogger(f'{__name__}.CorrectionShell')
	# ``cmd.Cmd`` displays the attribute called ``prompt``
	prompt = 'CorrectOCR> '
	# Single-letter shortcuts and the full commands they expand to
	_abbreviations = {
		'o': 'original',
		'k': 'kbest 1',
		'd': 'defer',
		'q': 'quit',
	}

	def __init__(self, tokens: TokenList, dictionary, correctionTracking: dict):
		"""
		:param tokens: A list of Tokens.
		:param dictionary: A dictionary against which to check validity. Must
			support ``in`` and ``add()``.
		:param correctionTracking: Correction counters, updated in place.
		"""
		super().__init__()
		self.token = None
		self.heuristic = None
		self.selection = None
		self.tokenwindow = _split_window(tokens, before=7, after=7)
		self.dictionary = dictionary
		self.metrics = {
			'tokenCount': 0,
			'humanCount': 0,
			'tokenTotal': len(tokens),
			'newWords': [],
			'correctionTracking': correctionTracking,
		}
		self.use_rawinput = True

	@classmethod
	def start(cls, tokens: TokenList, dictionary, correctionTracking: dict, intro: str = None):
		"""
		Run an interactive correction session until it is quit.

		:param tokens: A list of Tokens.
		:param dictionary: A dictionary against which to check validity.
		:param correctionTracking: Counters of saved corrections, keyed by the
			original and the corrected word joined by a tab. Updated in place.
		:param intro: Optional introduction text.
		:return: The metrics dictionary of the session (``tokenCount``,
			``humanCount``, ``tokenTotal``, ``newWords``, ``correctionTracking``).
		"""
		sh = cls(tokens, dictionary, correctionTracking)
		sh.cmdloop(intro)
		return sh.metrics

	def preloop(self):
		"""Load the first token before the command loop starts."""
		stop = self._nexttoken()
		if stop:
			# cmdloop() ignores the return value of preloop(), so the quit
			# has to be queued to end the loop when there is nothing to do.
			self.cmdqueue.insert(0, 'quit')
		return stop

	def _nexttoken(self):
		"""
		Advance to the next token without a gold value and present it.

		:return: The result of the ``quit`` command when the tokens are
			exhausted, otherwise ``None``.
		"""
		try:
			while True:  # do-while loop...
				ctxl, self.token, ctxr = next(self.tokenwindow)
				self.metrics['tokenCount'] += 1
				if not self.token.gold:
					break
		except StopIteration:
			print('Reached end of tokens, going to quit...')
			return self.onecmd('quit')

		if self.token.heuristic != 'annotator':
			# Let the command loop apply the heuristic's decision by itself
			self.cmdqueue.insert(0, f'{self.token.heuristic} {self.token.selection}')
			return None

		self.metrics['humanCount'] += 1 # increment human-effort count

		# Preceding tokens may already be corrected, so prefer their gold value
		left = ' '.join([c.gold or c.original for c in ctxl])
		right = ' '.join([c.original for c in ctxr])
		print(f'\n\n...{left} \033[1;7m{self.token.original}\033[0m {right}...\n')
		print(f'\nSELECT for {self.token.original} :\n')
		for k, item in self.token.kbest.items():
			inDict = ' * is in dictionary' if item.candidate in self.dictionary else ''
			print(f'\t{k}. {item.candidate} ({item.probability:.2e}){inDict}\n')

		self.prompt = f"CorrectOCR {self.metrics['tokenCount']}/{self.metrics['tokenTotal']} ({self.metrics['humanCount']}) > "
		return None

	def _select(self, word: str, heuristic: str, save=True):
		"""
		Set ``word`` as the gold value of the current token and move on.

		:param word: The chosen correction.
		:param heuristic: Description of how the word was chosen (for display).
		:param save: Whether to add the word to the dictionary and count the
			correction in the tracking metrics.
		:return: The result of advancing to the next token.
		"""
		print(f'Selecting {heuristic} for "{self.token.original}": "{word}"')
		self.token.gold = word
		if save:
			if word not in self.dictionary:
				self.metrics['newWords'].append(word) # add to suggestions for dictionary review
			self.dictionary.add(word) # add to current dictionary for subsequent heuristics
			tracking = self.metrics['correctionTracking']
			key = f'{self.token.original}\t{word}'
			tracking[key] = tracking.get(key, 0) + 1
		return self._nexttoken()

	def _kbest_candidate(self, arg: str, default=None):
		"""
		Look up the k-best candidate of the current token named by ``arg``.

		:param arg: Command argument; its first whitespace-separated field is
			parsed as the number k.
		:param default: The k to use when ``arg`` is empty.
		:return: A ``(k, candidate)`` tuple, or ``None`` (after logging an
			error) if ``arg`` is not a number or there is no such candidate.
		"""
		fields = arg.split()
		try:
			if fields:
				k = int(fields[0])
			elif default is not None:
				k = default
			else:
				raise ValueError('no number given')
			return k, self.token.kbest[k].candidate
		except (ValueError, LookupError) as e:
			CorrectionShell.log.error(f'invalid k-best selection "{arg}": {e!r}')
			return None

	def emptyline(self):
		"""Repeat the last command on an empty line only if it was ``original``."""
		if self.lastcmd == 'original':
			return super().emptyline() # repeats by default
		return None # dont repeat other commands

	def do_original(self, _: str):
		"""Choose original (abbreviation: o)"""
		return self._select(self.token.original, 'original')

	def do_shell(self, arg: str):
		"""Custom input to replace token"""
		return self._select(arg, 'user input')

	def do_kbest(self, arg: str):
		"""Choose k-best by number (abbreviation: just the number)"""
		choice = self._kbest_candidate(arg, default=1)
		if choice is None:
			return None # stay on the current token
		k, candidate = choice
		return self._select(candidate, f'{k}-best')

	def do_kdict(self, arg: str):
		"""Choose k-best which is in dictionary"""
		choice = self._kbest_candidate(arg)
		if choice is None:
			return None # stay on the current token
		_, candidate = choice
		return self._select(candidate, 'k-best from dict')

	def do_memoized(self, arg: str):
		"""Choose a memoized correction (not saved to dictionary or tracking)"""
		return self._select(arg, 'memoized correction', save=False)

	def do_error(self, arg: str):
		"""Log an error for the current token"""
		CorrectionShell.log.error(f'ERROR: {arg} {self.token}')

	def do_linefeed(self, _: str):
		"""Replace token with a linefeed (not saved to dictionary or tracking)"""
		return self._select('\n', 'linefeed', save=False)

	def do_defer(self, _: str):
		"""Defer heuristic for another time."""
		print('Deferring heuristic...')
		return self._nexttoken()

	# noinspection PyMethodMayBeStatic
	def do_quit(self, _: str):
		"""Quit the shell (abbreviation: q)"""
		return True

	def default(self, line: str):
		"""
		Expand abbreviations by queueing the full command; reject anything else.

		:param line: The input line that did not match a command.
		"""
		if line in self._abbreviations:
			self.cmdqueue.insert(0, self._abbreviations[line])
		elif line.isnumeric():
			self.cmdqueue.insert(0, f'kbest {line}')
		elif line == 'p':
			print(self.heuristic, self.selection, self.token) # for debugging
		else:
			CorrectionShell.log.error(f'bad command: "{line}"')
			return super().default(line)
		return None