# Source code for CorrectOCR.tokens._super

from __future__ import annotations

import abc
import collections
import datetime
import json
import logging
import string
import traceback
from pathlib import Path
from typing import Any, DefaultDict, List, NamedTuple, Optional, Tuple

import nltk

from .list import TokenList
from .._util import punctuationRE
from ..fileio import FileIO
from ..heuristics import Bin
from ..model.kbest import KBestItem


def tokenize_str(data: str, language: str = 'english') -> List[str]:
	"""
	Split a string into word tokens using NLTK.

	:param data: The text to tokenize.
	:param language: Language name; it is lowercased before being handed to
		``nltk.tokenize.word_tokenize``.
	:return: The list of token strings returned by NLTK.
	"""
	return nltk.tokenize.word_tokenize(data, language.lower())


##########################################################################################


class UpdateModifiedAccess:
	"""
	Data descriptor that timestamps its owner whenever the attribute is assigned.

	For an attribute declared as ``name = UpdateModifiedAccess()``:

	-  the value is stored on the instance under ``_name``;
	-  every assignment first sets ``last_modified`` on the instance to
	   ``datetime.datetime.now()``;
	-  if the instance has a ``_post_name`` attribute, it is called with the
	   newly assigned value after the value has been stored.
	"""

	def __set_name__(self, owner, name):
		# Called at class creation; derive the storage and hook names once.
		self.public_name = name
		self.private_name = '_' + name
		self.post_effect_name = '_post_' + name

	def __get__(self, obj, objtype=None):
		if obj is None:
			# Class-level access (e.g. ``Owner.name``, introspection tools):
			# return the descriptor itself instead of failing on ``getattr(None, ...)``.
			return self
		return getattr(obj, self.private_name)

	def __set__(self, obj, value):
		# Timestamp first, then store, then run the optional post-assignment hook.
		obj.last_modified = datetime.datetime.now()
		setattr(obj, self.private_name, value)
		post_effect = getattr(obj, self.post_effect_name, None)
		if post_effect is not None:
			post_effect(value)


##########################################################################################


class Token(abc.ABC):
	"""
	Abstract base class. Tokens handle single words. ...

	Concrete subclasses are registered by class name via :meth:`register` so that
	:meth:`from_dict` can re-create the right type from a serialized dictionary.

	The ``gold``, ``is_hyphenated``, ``is_discarded`` and ``has_error`` attributes
	are managed by :class:`UpdateModifiedAccess`: assigning to any of them updates
	``last_modified``.
	"""
	# Fallback logger for the base class; subclasses may define their own ``log``.
	log = logging.getLogger(f'{__name__}.Token')
	_subclasses = dict()
	gold = UpdateModifiedAccess()
	is_hyphenated = UpdateModifiedAccess()
	is_discarded = UpdateModifiedAccess()
	has_error = UpdateModifiedAccess()

	def _post_is_discarded(self, value):
		# Hook invoked after ``is_discarded`` is assigned: discarding a token
		# sets its gold value to the empty string.
		if value is True:
			self.gold = ''

	@staticmethod
	def register(cls):
		"""
		Decorator which registers a :class:`Token` subclass with the base class.

		:param cls: Token subclass
		:return: The same class, unchanged, so this can be used as a decorator.
		"""
		Token._subclasses[cls.__name__] = cls
		return cls

	def __init__(self, original: str, docid: str, index: int):
		"""
		:param original: Original spelling of the token.
		:param docid: The doc with which the Token is associated.
		:param index: The placement of the Token in the doc.
		:raises TypeError: If the base class itself is instantiated.
		:raises ValueError: If ``docid`` or ``index`` is ``None``.
		"""
		if type(self) is Token:
			raise TypeError("Token base class cannot be directly instantiated")
		if docid is None:
			raise ValueError('Tokens must have a docid!')
		if index is None:
			raise ValueError('Tokens must have an index!')
		self.original = original
		self.docid = docid  #: The doc with which the Token is associated.
		self.index = index #: The placement of the Token in the doc.
		self.gold = None # (managed by the UpdateModifiedAccess descriptor)
		self.bin: Optional[Bin] = None  #: Heuristics bin.
		self.kbest: DefaultDict[int, KBestItem] = collections.defaultdict(KBestItem)
		"""
		Dictionary of *k*-best suggestions for the Token. They are keyed
		with a numerical index starting at 1, and the values are instances
		of :class:`KBestItem`.
		"""
		self.heuristic: Optional[str] = None #: The heuristic that was determined by the bin.
		self.selection: Any = None #: The selected automatic correction for the :attr:`heuristic`.
		self.is_hyphenated = False # (managed by the UpdateModifiedAccess descriptor)
		self.is_discarded = False # (managed by the UpdateModifiedAccess descriptor)

		self.annotations = [] #: A list of arbitrary key/value info about the annotations
		self.has_error = False #: Whether the token has an unhandled error
		# The descriptor assignments above stamped ``last_modified``; reset it so a
		# freshly constructed token counts as unmodified.
		self.last_modified = None #: When one of the ``gold``, ``is_hyphenated``, ``is_discarded``, or ``has_error`` properties were last updated.

		self.cached_image_path = FileIO.imageCache(self.docid).joinpath(
			f'{self.index}.png'
		) #: Where the image file should be cached. Is not guaranteed to exist, but can be generated via extract_image()

		if self.is_punctuation():
			# Write the private attribute directly so that ``last_modified`` stays None.
			self._gold = self.original

	@property
	@abc.abstractmethod
	def token_info(self) -> Any:
		"""
		Subclass-specific token data. It is serialized with ``json.dumps`` in
		:attr:`__dict__` and handed back to the subclass constructor by
		:meth:`from_dict`.

		:return: The base implementation returns ``None``.
		"""
		return None

	@property
	@abc.abstractmethod
	def page(self) -> int:
		"""
		The page of the document on which the token is located.

		May not be applicable for all token types.

		:return: The page number.
		"""
		return None

	@property
	@abc.abstractmethod
	def frame(self) -> Tuple[int, int, int, int]:
		"""
		The coordinates of the token's location on the page.

		Takes the form [x0, y0, x1, y1] where (x0, y0) is the top-left corner, and
		(x1, y1) is the bottom-right corner.

		May not be applicable for all token types.

		:return: The frame coordinates.
		"""
		return None

	@property
	def k(self) -> int:
		"""
		The number of *k*-best suggestions for the Token.
		"""
		return len(self.kbest)

	def __str__(self):
		# ``vars(self)`` resolves to the serializable ``__dict__`` property below.
		return f'<{self.__class__.__name__} {vars(self)}>'

	def __repr__(self):
		return self.__str__()

	def __eq__(self, other):
		# Tokens compare by original spelling, either to each other or to plain strings.
		if isinstance(other, self.__class__):
			return self.original == other.original
		if isinstance(other, str):
			return self.original == other
		return NotImplemented

	def __lt__(self, other):
		if isinstance(other, self.__class__):
			return self.original < other.original
		if isinstance(other, str):
			return self.original < other
		return NotImplemented

	def __hash__(self):
		# Kept consistent with __eq__: hash on the original spelling.
		return hash(self.original)

	def is_punctuation(self) -> bool:
		"""
		Is the Token purely punctuation?

		:return: ``True`` if ``punctuationRE`` matches the whole original string.
		"""
		return bool(punctuationRE.fullmatch(self.original))

	def is_numeric(self) -> bool:
		"""
		Is the Token purely numeric?
		"""
		return self.original.isnumeric()

	@property
	def __dict__(self):
		"""
		Serializable dictionary representation of the Token; the inverse of
		:meth:`from_dict`. ``Bin`` is only included when a bin is set, and
		``Last Modified`` is a POSIX timestamp or ``None``.
		"""
		output = {
			'Gold': self.gold,
			'Original': self.original,
			'Doc ID': self.docid,
			'Index': self.index,
			'Hyphenated': self.is_hyphenated,
			'Discarded': self.is_discarded,
			'Page': self.page,
			'Frame': self.frame,
		}
		output['k-best'] = {k: vars(item) for k, item in self.kbest.items()}
		if self.bin:
			output['Bin'] = self.bin.number
		output['Heuristic'] = self.heuristic
		output['Selection'] = self.selection
		output['Token type'] = self.__class__.__name__
		output['Token info'] = json.dumps(self.token_info)
		output['Annotations'] = json.dumps(self.annotations)
		output['Has error'] = self.has_error
		output['Last Modified'] = self.last_modified.timestamp() if self.last_modified else None

		return output

	@classmethod
	def from_dict(cls, d: dict) -> Token:
		"""
		Initialize and return a new Token with values from a dictionary.

		:param d: A dictionary of properties for the Token
		:return: An instance of the registered subclass named by ``d['Token type']``.
		:raises ValueError: If ``d`` is not a mapping, or if any property cannot be
			applied to the new token (the underlying error is chained as the cause).
		"""
		# ``collections.Mapping`` was removed in Python 3.10; use the abc module.
		from collections.abc import Mapping

		if not isinstance(d, Mapping):
			raise ValueError(f'Object is not dict-like: {d}')
		classname = d['Token type']
		t = Token._subclasses[classname](
			json.loads(d['Token info']),
			d['Doc ID'],
			d['Index']
		)
		try:
			t.gold = d.get('Gold', None)
			t.is_hyphenated = bool(d.get('Hyphenated', False))
			t.is_discarded = bool(d.get('Discarded', False))
			# Annotations are stored as a JSON string; a missing/empty value means none.
			t.annotations = json.loads(d.get('Annotations') or '[]')
			t.has_error = bool(d.get('Has error', False))

			# Must come after the descriptor assignments above, since each of those
			# stamps ``last_modified`` with the current time.
			last_modified = d.get('Last Modified', None)
			if isinstance(last_modified, (int, float)):
				# ``__dict__`` serializes a POSIX timestamp; restore the datetime so
				# the token can be serialized again.
				last_modified = datetime.datetime.fromtimestamp(last_modified)
			t.last_modified = last_modified
			if 'k-best' in d:
				kbest = collections.defaultdict(KBestItem)
				for k, b in d['k-best'].items():
					kbest[k] = KBestItem(b['candidate'], b['probability'])
				t.kbest = kbest
			if 'Bin' in d and d['Bin'] not in (None, '', '-1', -1):
				# Imported here rather than at module level, as in the original code.
				from ..heuristics import Heuristics
				t.bin = Heuristics.bin(int(d['Bin']))
			t.heuristic = d.get('Heuristic', None)
			t.selection = d.get('Selection', None)
		except Exception as e:
			raise ValueError(f'Could not initialize token {t} from {d}') from e
		return t

	def drop_cached_image(self):
		"""
		Delete the cached image file, if it exists. Failure to delete is logged
		rather than raised.
		"""
		if self.cached_image_path.is_file():
			try:
				self.cached_image_path.unlink()
			except OSError:
				self.__class__.log.error(f'Could not delete image:\n{traceback.format_exc()}')

	def extract_image(self, workspace, highlight_word=True, left=300, right=300, top=15, bottom=15, force=False) -> Tuple[Path, Any]:
		"""
		No-op in this base class (returns ``None``).

		NOTE(review): presumably subclasses override this to render the token's
		image into :attr:`cached_image_path` -- confirm against the subclasses.
		"""
		pass


##########################################################################################


class Tokenizer(abc.ABC):
	"""
	Abstract base class. The `Tokenizer` subclasses handle extracting :class:`Token` instances from a document.

	Subclasses are registered per filename extension with :meth:`register` and
	looked up with :meth:`for_extension`.
	"""
	log = logging.getLogger(f'{__name__}.Tokenizer')
	_subclasses = dict()

	@staticmethod
	def register(extensions: List[str]):
		"""
		Decorator which registers a :class:`Tokenizer` subclass with the base class.

		:param extensions: List of extensions that the subclass will handle
		:return: A class decorator that records the class for every extension and
			returns the class unchanged.
		"""
		def wrapper(cls):
			# A later registration for the same extension replaces the earlier one.
			for ext in extensions:
				Tokenizer._subclasses[ext] = cls
			return cls
		return wrapper

	@staticmethod
	def for_extension(ext: str) -> type:
		"""
		Obtain the suitable subclass for the given extension. Currently, Tokenizers are
		provided for the following extensions:

		-  ``.txt`` -- plain old text.
		-  ``.pdf`` -- assumes the PDF contains images and OCRed text.
		-  ``.tiff`` -- will run OCR on the image and generate a PDF.
		-  ``.png`` -- will run OCR on the image and generate a PDF.

		:param ext: Filename extension (including leading period).
		:return: A Tokenizer subclass.
		:raises KeyError: If no subclass has been registered for ``ext``.
		"""
		Tokenizer.log.debug(f'_subclasses: {Tokenizer._subclasses}')
		return Tokenizer._subclasses[ext]

	def __init__(self, language):
		"""

		:type language: :class:`pycountry.Language`
		:param language: The language to use for tokenization (for example, the `.txt` tokenizer internally uses nltk whose tokenizers function best with a language parameter).
		"""
		self.language = language
		self.tokens = []

	@abc.abstractmethod
	def tokenize(self, file: Path, storageconfig) -> TokenList:
		"""
		Generate tokens for the given document.

		:param storageconfig: Storage configuration (database, filesystem) for resulting Tokens
		:param file: A given document.
		:return: The tokens extracted from the document.
		"""
		pass

	@staticmethod
	@abc.abstractmethod
	def apply(original: Path, tokens: TokenList, outfile: Path, highlight=False):
		"""
		Abstract; must be implemented by subclasses.

		NOTE(review): presumably writes the corrected ``tokens`` for ``original``
		to ``outfile`` -- confirm against the subclasses.
		"""
		pass

	@staticmethod
	@abc.abstractmethod
	def crop_tokens(original, config, tokens, edge_left = None, edge_right = None):
		"""
		Abstract; must be implemented by subclasses.

		NOTE(review): presumably drops tokens outside the given left/right
		edges -- confirm against the subclasses.
		"""
		pass