Source code for CorrectOCR.tokens._super

from __future__ import annotations

import abc
import collections
import json
import logging
import string
from pathlib import Path
from typing import Any, DefaultDict, List, NamedTuple, Optional

import nltk
import regex

from .list import TokenList
from ..heuristics import Bin


[docs]def tokenize_str(data: str, language='english') -> List[str]:
	return nltk.tokenize.word_tokenize(data, language.lower())


##########################################################################################


[docs]class KBestItem(NamedTuple):
	candidate: str = ''
	probability: float = 0.0

	def __repr__(self) -> str:
		return f'<KBestItem {self.candidate}, {self.probability:.2e}>'


##########################################################################################


[docs]class Token(abc.ABC):
	"""
	Abstract base class. Tokens handle single words. ...
	"""
	_subclasses = dict()

[docs]	@staticmethod
	def register(cls):
		"""
		Decorator which registers a :class:`Token` subclass with the base class.

		:param cls: Token subclass
		"""
		Token._subclasses[cls.__name__] = cls
		return cls

	_punctuation_splitter = regex.compile(r'^(\p{punct}*)(.*?)(\p{punct}*)$')

	def __init__(self, original: str, docid: str, index: int):
		"""
		:param original: Original spelling of the token.
		:param docid: The doc with which the Token is associated.
		"""
		if type(self) is Token:
			raise TypeError("Token base class cannot not be directly instantiated")
		m = Token._punctuation_splitter.search(original)
		(self._punct_prefix, self.normalized, self._punct_suffix) = m.groups('')
		self.docid = docid  #: The doc with which the Token is associated.
		self.index = index #: The placement of the Token in the doc.
		self.gold = None
		self.bin: Optional[Bin] = None  #: Heuristics bin.
		self.kbest: DefaultDict[int, KBestItem] = collections.defaultdict(KBestItem)
		"""
		Dictionary of *k*-best suggestions for the Token. They are keyed
		with a numerical index starting at 1, and the values are instances
		of :class:`KBestItem`.
		"""
		self.decision: Optional[str] = None #: The decision that was made when :attr:`gold` was set automatically.
		self.selection: Any = None #: The selected automatic correction for the :attr:`decision`.
		self.is_hyphenated = False #: Whether the token is hyphenated to the following token.
		self.is_discarded = False #: Whether the token has been discarded (marked irrelevant by code or annotator).

		if self.is_punctuation():
			#self.__class__.log.debug(f'{self}: is_punctuation')
			self._gold = self.normalized

	@property
	@abc.abstractmethod
	def token_info(self) -> Any:
		"""

		:return:
		"""
		return None

	@property
	def original(self) -> str:
		"""
		The original spelling of the Token.
		"""
		return f'{self._punct_prefix}{self.normalized}{self._punct_suffix}'

	@property
	def gold(self) -> str:
		"""
		The corrected spelling of the Token.
		"""
		return f'{self._punct_prefix}{self._gold}{self._punct_suffix}' if self._gold is not None else None

	@gold.setter
	def gold(self, gold):
		self._gold = gold
		if self._gold:
			self._gold = self._gold.lstrip(string.punctuation).rstrip(string.punctuation)

	@property
	def k(self) -> int:
		"""
		The number of *k*-best suggestions for the Token.
		"""
		return len(self.kbest)

	def __str__(self):
		return f'<{self.__class__.__name__} "{self.original}" "{self.gold}" {self.kbest} {self.bin}>'

	def __repr__(self):
		return self.__str__()

	def __eq__(self, other):
		if isinstance(other, self.__class__):
			return self.original.__eq__(other.original)
		elif isinstance(other, str):
			return self.original.__eq__(other)
		else:
			return NotImplemented

	def __lt__(self, other):
		if isinstance(other, self.__class__):
			return self.original.__lt__(other.original)
		elif isinstance(other, str):
			return self.original.__lt__(other)
		else:
			return NotImplemented

	def __hash__(self):
		return self.original.__hash__()

	_is_punctuationRE = regex.compile(r'^\p{punct}+$')

[docs]	def is_punctuation(self) -> bool:
		"""
		Is the Token purely punctuation?
		"""
		#self.__class__.log.debug(f'{self}')
		return Token._is_punctuationRE.match(self.original)

[docs]	def is_numeric(self) -> bool:
		"""
		Is the Token purely numeric?
		"""
		return self.original.isnumeric()

	@property
	def __dict__(self):
		output = {
			'Gold': self.gold or '',
			'Original': self.original,
			'Doc ID': self.docid,
			'Index': self.index,
			'Hyphenated': self.is_hyphenated,
			'Discarded': self.is_discarded,
		}
		for k, item in self.kbest.items():
			output[f'{k}-best'] = item.candidate
			output[f'{k}-best prob.'] = item.probability
		if self.bin:
			output['Bin'] = self.bin.number or -1
			output['Heuristic'] = self.bin.heuristic
			output['Decision'] = self.decision
			output['Selection'] = self.selection
		output['Token type'] = self.__class__.__name__
		output['Token info'] = json.dumps(self.token_info)

		return output

[docs]	@classmethod
	def from_dict(cls, d: dict) -> Token:
		"""
		Initialize and return a new Token with values from a dictionary.

		:param d: A dictionary of properties for the Token
		"""
		if not isinstance(d, collections.Mapping):
			raise ValueError(f'Object is not dict-like: {d}')
		classname = d['Token type']
		#self.__class__.log.debug(f'from_dict: {d}')
		t = Token._subclasses[classname](
			json.loads(d['Token info']),
			d.get('Doc ID', None),
			d.get('Index', -1)
		)
		t.gold = d.get('Gold', None)
		t.is_hyphenated = d.get('Hyphenated', False)
		t.is_discarded = d.get('Discarded', False)
		kbest = collections.defaultdict(lambda: KBestItem(''))
		k = 1
		while f'{k}-best' in d:
			candidate = d[f'{k}-best']
			if candidate == '':
				break
			probability = d[f'{k}-best prob.']
			kbest[k] = KBestItem(candidate, float(probability))
			k += 1
		t.kbest = kbest
		if 'Bin' in d and d['Bin'] not in ('', '-1', -1):
			from ..heuristics import Heuristics
			t.bin = Heuristics.bin(int(d['Bin']))
			t.bin.heuristic = d['Heuristic']
			t.decision = d['Decision']
			t.selection = d['Selection']
		#t.__class__.log.debug(t)
		return t


##########################################################################################


[docs]class Tokenizer(abc.ABC):
	"""
	Abstract base class. The `Tokenizer` subclasses handle extracting :class:`Token` instances from a document.
	"""
	log = logging.getLogger(f'{__name__}.Tokenizer')
	_subclasses = dict()

[docs]	@staticmethod
	def register(extensions: List[str]):
		"""
		Decorator which registers a :class:`Tokenizer` subclass with the base class.

		:param extensions: List of extensions that the subclass will handle
		"""
		def wrapper(cls):
			for ext in extensions:
				Tokenizer._subclasses[ext] = cls
			return cls
		return wrapper

[docs]	@staticmethod
	def for_extension(ext: str) -> TokenList.__class__:
		"""
		Obtain the suitable subclass for the given extension. Currently, Tokenizers are
		provided for the following extensions:

		-  ``.txt`` -- plain old text.
		-  ``.pdf`` -- assumes the PDF contains images and OCRed text.
		-  ``.tiff`` -- will run OCR on the image and generate a PDF.
		-  ``.png`` -- will run OCR on the image and generate a PDF.

		:param ext: Filename extension (including leading period).
		:return: A Tokenizer subclass.
		"""
		Tokenizer.log.debug(f'_subclasses: {Tokenizer._subclasses}')
		return Tokenizer._subclasses[ext]

	def __init__(self, language, dehyphenate):
		"""

		:type language: :class:`pycountry.Language`
		:param language: The language to use for tokenization (for example, the `.txt` tokenizer internally uses nltk whose tokenizers function best with a language parameter).
		"""
		self.language = language
		self.dehyphenate = dehyphenate
		self.tokens = []

[docs]	@abc.abstractmethod
	def tokenize(self, file: Path, storageconfig) -> TokenList:
		"""
		Generate tokens for the given document.

		:param storageconfig: Storage configuration (database, filesystem) for resulting Tokens
		:param file: A given document.
		:return:
		"""
		pass

[docs]	@staticmethod
	@abc.abstractmethod
	def apply(original: Path, tokens: TokenList, corrected: Path):
		pass

[docs]	@staticmethod
	@abc.abstractmethod
	def crop_tokens(original, config, tokens, edge_left = None, edge_right = None):
		pass