Source code for CorrectOCR.tokens._super

from __future__ import annotations

import abc
import collections
import datetime
import json
import logging
import string
import traceback
from pathlib import Path
from typing import Any, DefaultDict, List, NamedTuple, Optional, Tuple

import nltk

from .list import TokenList
from .._util import punctuationRE
from ..fileio import FileIO
from ..heuristics import Bin
from ..model.kbest import KBestItem


def tokenize_str(data: str, language='english') -> List[str]:
    """
    Split a string into word tokens using nltk's word tokenizer.

    :param data: The text to tokenize.
    :param language: Name of the language to tokenize for. It is lowercased
        before being handed to nltk.
    :return: The list of token strings produced by nltk.
    """
    normalized_language = language.lower()
    return nltk.tokenize.word_tokenize(data, normalized_language)
##########################################################################################


class UpdateModifiedAccess:
	"""
	Data descriptor that timestamps its owner on every assignment.

	For a descriptor bound to the class attribute ``name``:

	- the value is stored on the instance under ``_name``;
	- each assignment first sets the instance's ``last_modified`` to the
	  current (naive, local) time;
	- if the instance has an attribute called ``_post_name``, it is called
	  with the newly assigned value after the value has been stored.
	"""

	def __set_name__(self, owner, name):
		# Called once at class creation; derive the helper attribute names.
		self.public_name = name
		self.private_name = f'_{name}'
		self.post_effect_name = f'_post_{name}'

	def __get__(self, obj, objtype=None):
		# Plain read of the privately stored value.
		return getattr(obj, self.private_name)

	def __set__(self, obj, value):
		# Timestamp first, then store, then run the optional hook.
		obj.last_modified = datetime.datetime.now()
		setattr(obj, self.private_name, value)
		try:
			post_effect = getattr(obj, self.post_effect_name)
		except AttributeError:
			# No post-assignment hook defined for this attribute.
			return
		post_effect(value)


##########################################################################################
class Token(abc.ABC):
	"""
	Abstract base class. Tokens handle single words.

	Concrete subclasses must implement :attr:`token_info`, :attr:`page` and
	:attr:`frame`. Subclasses decorated with :meth:`register` can be
	re-created from their dictionary form via :meth:`from_dict`.

	Assigning to :attr:`gold`, :attr:`is_hyphenated`, :attr:`is_discarded` or
	:attr:`has_error` also updates :attr:`last_modified`.
	"""

	#: Default logger, so that ``self.__class__.log`` always resolves.
	#: Subclasses may override it with their own logger.
	log = logging.getLogger(f'{__name__}.Token')

	#: Maps subclass name to subclass; filled in by :meth:`register`.
	_subclasses = dict()

	#: The gold value of the token. Set to the original for pure punctuation
	#: tokens, and reset to ``''`` when the token is discarded.
	gold = UpdateModifiedAccess()
	#: Whether the token is hyphenated.
	is_hyphenated = UpdateModifiedAccess()
	#: Whether the token has been discarded.
	is_discarded = UpdateModifiedAccess()
	#: Whether the token has an unhandled error.
	has_error = UpdateModifiedAccess()

	def _post_is_discarded(self, value):
		# Hook run after every assignment to ``is_discarded``:
		# a discarded token has its gold value cleared.
		if value is True:
			self.gold = ''

	@staticmethod
	def register(cls):
		"""
		Decorator which registers a :class:`Token` subclass with the base class.

		:param cls: Token subclass
		:return: The same class, unchanged.
		"""
		Token._subclasses[cls.__name__] = cls
		return cls

	def __init__(self, original: str, docid: str, index: int):
		"""
		:param original: Original spelling of the token.
		:param docid: The doc with which the Token is associated.
		:param index: The placement of the Token in the doc.
		:raises TypeError: If the base class itself is instantiated.
		:raises ValueError: If ``docid`` or ``index`` is ``None``.
		"""
		if type(self) is Token:
			raise TypeError("Token base class cannot be directly instantiated")
		if docid is None:
			raise ValueError('Tokens must have a docid!')
		if index is None:
			raise ValueError('Tokens must have an index!')
		self.original = original
		self.docid = docid  #: The doc with which the Token is associated.
		self.index = index  #: The placement of the Token in the doc.
		self.gold = None
		self.bin: Optional[Bin] = None  #: Heuristics bin.
		#: Dictionary of *k*-best suggestions for the Token. They are keyed
		#: with a numerical index starting at 1, and the values are
		#: instances of :class:`KBestItem`.
		self.kbest: DefaultDict[int, KBestItem] = collections.defaultdict(KBestItem)
		self.heuristic: Optional[str] = None  #: The heuristic that was determined by the bin.
		self.selection: Any = None  #: The selected automatic correction for the :attr:`heuristic`.
		self.is_hyphenated = False
		self.is_discarded = False
		self.annotations = []  #: A list of arbitrary key/value info about the annotations
		self.has_error = False
		#: When one of the ``gold``, ``is_hyphenated``, ``is_discarded``, or
		#: ``has_error`` properties were last updated. Reset here because
		#: the assignments above already touched it.
		self.last_modified = None
		#: Where the image file should be cached. Is not guaranteed to
		#: exist, but can be generated via extract_image()
		self.cached_image_path = FileIO.imageCache(self.docid).joinpath(
			f'{self.index}.png'
		)

		if self.is_punctuation():
			# Write the private attribute directly so that ``last_modified``
			# is not touched by this automatic assignment.
			self._gold = self.original

	@property
	@abc.abstractmethod
	def token_info(self) -> Any:
		"""
		Subclass-specific token data. It is serialized with ``json.dumps``
		in the dictionary form, and the decoded value is passed as the first
		constructor argument by :meth:`from_dict`.

		:return: The token info.
		"""
		return None

	@property
	@abc.abstractmethod
	def page(self) -> int:
		"""
		The page of the document on which the token is located.

		May not be applicable for all token types.

		:return: The page number.
		"""
		return None

	@property
	@abc.abstractmethod
	def frame(self) -> Tuple[int, int, int, int]:
		"""
		The coordinates of the token's location on the page. Takes the form
		[x0, y0, x1, y1] where (x0, y0) is the top-left corner, and
		(x1, y1) is the bottom-right corner.

		May not be applicable for all token types.

		:return: The frame coordinates.
		"""
		return None

	@property
	def k(self) -> int:
		"""
		The number of *k*-best suggestions for the Token.
		"""
		return len(self.kbest)

	def __str__(self):
		return f'<{self.__class__.__name__} {vars(self)}>'

	def __repr__(self):
		return self.__str__()

	def __eq__(self, other):
		# Tokens compare by original spelling, against tokens or plain strings.
		if isinstance(other, self.__class__):
			return self.original == other.original
		if isinstance(other, str):
			return self.original == other
		return NotImplemented

	def __lt__(self, other):
		if isinstance(other, self.__class__):
			return self.original < other.original
		if isinstance(other, str):
			return self.original < other
		return NotImplemented

	def __hash__(self):
		# Kept consistent with __eq__: equal originals hash equally.
		return hash(self.original)

	def is_punctuation(self) -> bool:
		"""
		Is the Token purely punctuation?
		"""
		return punctuationRE.fullmatch(self.original) is not None

	def is_numeric(self) -> bool:
		"""
		Is the Token purely numeric?
		"""
		return self.original.isnumeric()

	@property
	def __dict__(self):
		# Overrides ``__dict__`` so that ``vars(token)`` yields the
		# serializable dictionary form that from_dict() reads back.
		output = {
			'Gold': self.gold,
			'Original': self.original,
			'Doc ID': self.docid,
			'Index': self.index,
			'Hyphenated': self.is_hyphenated,
			'Discarded': self.is_discarded,
			'Page': self.page,
			'Frame': self.frame,
		}
		output['k-best'] = {k: vars(item) for k, item in self.kbest.items()}
		if self.bin:
			output['Bin'] = self.bin.number
		output['Heuristic'] = self.heuristic
		output['Selection'] = self.selection
		output['Token type'] = self.__class__.__name__
		output['Token info'] = json.dumps(self.token_info)
		output['Annotations'] = json.dumps(self.annotations)
		output['Has error'] = self.has_error
		output['Last Modified'] = self.last_modified.timestamp() if self.last_modified else None
		return output

	@classmethod
	def from_dict(cls, d: dict) -> Token:
		"""
		Initialize and return a new Token with values from a dictionary.

		:param d: A dictionary of properties for the Token
		:return: An instance of the registered subclass named in ``d['Token type']``.
		:raises ValueError: If ``d`` is not a mapping, or if the token could
			not be populated from it.
		:raises KeyError: If a required key is missing or the token type is
			not registered.
		"""
		# ``collections.Mapping`` was removed in Python 3.10.
		from collections.abc import Mapping

		if not isinstance(d, Mapping):
			raise ValueError(f'Object is not dict-like: {d}')
		classname = d['Token type']
		t = Token._subclasses[classname](
			json.loads(d['Token info']),
			d['Doc ID'],
			d['Index'],
		)
		try:
			t.gold = d.get('Gold', None)
			t.is_hyphenated = bool(d.get('Hyphenated', False))
			t.is_discarded = bool(d.get('Discarded', False))
			# json.loads() needs a string, so the fallback is the JSON text
			# of an empty list (also covers None and empty strings).
			t.annotations = json.loads(d.get('Annotations') or '[]')
			t.has_error = bool(d.get('Has error', False))
			# Assigned after the tracked attributes above, which would
			# otherwise overwrite it with the current time.
			last_modified = d.get('Last Modified', None)
			if isinstance(last_modified, (int, float)) and not isinstance(last_modified, bool):
				# The dictionary form stores a POSIX timestamp; restore the
				# datetime so the value can be serialized again.
				last_modified = datetime.datetime.fromtimestamp(last_modified)
			t.last_modified = last_modified
			if 'k-best' in d:
				kbest = collections.defaultdict(KBestItem)
				for k, b in d['k-best'].items():
					kbest[k] = KBestItem(b['candidate'], b['probability'])
				t.kbest = kbest
			if 'Bin' in d and d['Bin'] not in (None, '', '-1', -1):
				from ..heuristics import Heuristics
				t.bin = Heuristics.bin(int(d['Bin']))
			t.heuristic = d.get('Heuristic', None)
			t.selection = d.get('Selection', None)
		except Exception as e:
			raise ValueError(f'Could not initialize token {t} from {d}') from e
		return t

	def drop_cached_image(self):
		"""
		Delete the cached image file, if it exists.

		Failure to delete is logged and otherwise ignored.
		"""
		if self.cached_image_path.is_file():
			try:
				self.cached_image_path.unlink()
			except OSError:
				self.__class__.log.error(f'Could not delete image:\n{traceback.format_exc()}')

	def extract_image(self, workspace, highlight_word=True, left=300, right=300, top=15, bottom=15, force=False) -> Tuple[Path, Any]:
		"""
		Base implementation: does nothing and returns ``None``.

		NOTE(review): presumably overridden by subclasses that can produce an
		image of the token; the parameter meanings are not visible from this
		class -- confirm against the subclasses.
		"""
		pass
##########################################################################################
class Tokenizer(abc.ABC):
	"""
	Abstract base class.

	`Tokenizer` subclasses are responsible for extracting :class:`Token`
	instances from a document. Each subclass is looked up by the filename
	extensions it was registered for.
	"""

	log = logging.getLogger(f'{__name__}.Tokenizer')

	# Maps filename extension to Tokenizer subclass; filled in by register().
	_subclasses = {}

	@staticmethod
	def register(extensions: List[str]):
		"""
		Decorator which registers a :class:`Tokenizer` subclass with the base class.

		:param extensions: List of extensions that the subclass will handle
		:return: A class decorator that records the class and returns it unchanged.
		"""
		def wrapper(cls):
			# Point every listed extension at the decorated class.
			Tokenizer._subclasses.update(dict.fromkeys(extensions, cls))
			return cls
		return wrapper

	@staticmethod
	def for_extension(ext: str) -> TokenList.__class__:
		"""
		Obtain the suitable subclass for the given extension. Currently, Tokenizers are
		provided for the following extensions:

		- ``.txt`` -- plain old text.
		- ``.pdf`` -- assumes the PDF contains images and OCRed text.
		- ``.tiff`` -- will run OCR on the image and generate a PDF.
		- ``.png`` -- will run OCR on the image and generate a PDF.

		:param ext: Filename extension (including leading period).
		:return: A Tokenizer subclass.
		:raises KeyError: If no subclass is registered for ``ext``.
		"""
		Tokenizer.log.debug(f'_subclasses: {Tokenizer._subclasses}')
		registry = Tokenizer._subclasses
		return registry[ext]

	def __init__(self, language):
		"""
		:type language: :class:`pycountry.Language`
		:param language: The language to use for tokenization (for example, the `.txt` tokenizer
			internally uses nltk whose tokenizers function best with a language parameter).
		"""
		self.language = language
		self.tokens = list()

	@abc.abstractmethod
	def tokenize(self, file: Path, storageconfig) -> TokenList:
		"""
		Generate tokens for the given document.

		:param file: A given document.
		:param storageconfig: Storage configuration (database, filesystem) for resulting Tokens
		:return: The tokens generated from the document.
		"""
		pass

	@staticmethod
	@abc.abstractmethod
	def apply(original: Path, tokens: TokenList, outfile: Path, highlight=False):
		"""
		Abstract; must be implemented by subclasses.

		NOTE(review): presumably writes ``tokens`` back onto a copy of
		``original`` at ``outfile`` -- confirm against the subclasses.
		"""
		pass

	@staticmethod
	@abc.abstractmethod
	def crop_tokens(original, config, tokens, edge_left = None, edge_right = None):
		"""
		Abstract; must be implemented by subclasses.

		NOTE(review): the semantics of the edge parameters are not visible
		from this class -- confirm against the subclasses.
		"""
		pass