Source code for CorrectOCR.tokens._super

from __future__ import annotations

import abc
import collections
import json
import logging
import string
from pathlib import Path
from typing import Any, DefaultDict, List, NamedTuple, Optional

import nltk
import regex

from .list import TokenList
from ..heuristics import Bin


def tokenize_str(data: str, language='english') -> List[str]:
    return nltk.tokenize.word_tokenize(data, language.lower())
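
# Illustrative usage (not part of the original module), assuming NLTK's 'punkt'
# tokenizer data has been downloaded:
#
#     >>> tokenize_str('The quick brown fox.', language='English')
#     ['The', 'quick', 'brown', 'fox', '.']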
##########################################################################################
class KBestItem(NamedTuple):
    candidate: str = ''
    probability: float = 0.0

    def __repr__(self) -> str:
        return f'<KBestItem {self.candidate}, {self.probability:.2e}>'
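
# Illustrative usage (not part of the original module): a KBestItem holds one
# ranked correction candidate, and Token.kbest maps 1-based ranks to such items.
#
#     >>> KBestItem('example', 0.42)
#     <KBestItem example, 4.20e-01>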
##########################################################################################
class Token(abc.ABC):
    """
    Abstract base class. Tokens handle single words. ...
    """
    _subclasses = dict()

    @staticmethod
    def register(cls):
        """
        Decorator which registers a :class:`Token` subclass with the base class.

        :param cls: Token subclass
        """
        Token._subclasses[cls.__name__] = cls
        return cls

    _punctuation_splitter = regex.compile(r'^(\p{punct}*)(.*?)(\p{punct}*)$')

    def __init__(self, original: str, docid: str, index: int):
        """
        :param original: Original spelling of the token.
        :param docid: The doc with which the Token is associated.
        :param index: The placement of the Token in the doc.
        """
        if type(self) is Token:
            raise TypeError("Token base class cannot be directly instantiated")
        m = Token._punctuation_splitter.search(original)
        (self._punct_prefix, self.normalized, self._punct_suffix) = m.groups('')
        self.docid = docid  #: The doc with which the Token is associated.
        self.index = index  #: The placement of the Token in the doc.
        self.gold = None
        self.bin: Optional[Bin] = None  #: Heuristics bin.
        self.kbest: DefaultDict[int, KBestItem] = collections.defaultdict(KBestItem)
        """
        Dictionary of *k*-best suggestions for the Token. They are keyed
        with a numerical index starting at 1, and the values are instances
        of :class:`KBestItem`.
        """
        self.decision: Optional[str] = None  #: The decision that was made when :attr:`gold` was set automatically.
        self.selection: Any = None  #: The selected automatic correction for the :attr:`decision`.
        self.is_hyphenated = False  #: Whether the token is hyphenated to the following token.
        self.is_discarded = False  #: Whether the token has been discarded (marked irrelevant by code or annotator).
        if self.is_punctuation():
            #self.__class__.log.debug(f'{self}: is_punctuation')
            self._gold = self.normalized

    @property
    @abc.abstractmethod
    def token_info(self) -> Any:
        """
        :return:
        """
        return None

    @property
    def original(self) -> str:
        """
        The original spelling of the Token.
        """
        return f'{self._punct_prefix}{self.normalized}{self._punct_suffix}'

    @property
    def gold(self) -> str:
        """
        The corrected spelling of the Token.
        """
        return f'{self._punct_prefix}{self._gold}{self._punct_suffix}' if self._gold is not None else None

    @gold.setter
    def gold(self, gold):
        self._gold = gold
        if self._gold:
            self._gold = self._gold.lstrip(string.punctuation).rstrip(string.punctuation)

    @property
    def k(self) -> int:
        """
        The number of *k*-best suggestions for the Token.
        """
        return len(self.kbest)

    def __str__(self):
        return f'<{self.__class__.__name__} "{self.original}" "{self.gold}" {self.kbest} {self.bin}>'

    def __repr__(self):
        return self.__str__()

    def __eq__(self, other):
        if isinstance(other, self.__class__):
            return self.original.__eq__(other.original)
        elif isinstance(other, str):
            return self.original.__eq__(other)
        else:
            return NotImplemented

    def __lt__(self, other):
        if isinstance(other, self.__class__):
            return self.original.__lt__(other.original)
        elif isinstance(other, str):
            return self.original.__lt__(other)
        else:
            return NotImplemented

    def __hash__(self):
        return self.original.__hash__()

    _is_punctuationRE = regex.compile(r'^\p{punct}+$')

    def is_punctuation(self) -> bool:
        """
        Is the Token purely punctuation?
        """
        #self.__class__.log.debug(f'{self}')
        return Token._is_punctuationRE.match(self.original)

    def is_numeric(self) -> bool:
        """
        Is the Token purely numeric?
        """
        return self.original.isnumeric()
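
    # The ``__dict__`` property and :meth:`from_dict` below share the same flat
    # key layout ('Original', 'Gold', '<k>-best', 'Bin', ...), so a Token can be
    # round-tripped through a plain dictionary.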

    @property
    def __dict__(self):
        output = {
            'Gold': self.gold or '',
            'Original': self.original,
            'Doc ID': self.docid,
            'Index': self.index,
            'Hyphenated': self.is_hyphenated,
            'Discarded': self.is_discarded,
        }
        for k, item in self.kbest.items():
            output[f'{k}-best'] = item.candidate
            output[f'{k}-best prob.'] = item.probability
        if self.bin:
            output['Bin'] = self.bin.number or -1
            output['Heuristic'] = self.bin.heuristic
            output['Decision'] = self.decision
            output['Selection'] = self.selection
        output['Token type'] = self.__class__.__name__
        output['Token info'] = json.dumps(self.token_info)
        return output

    @classmethod
    def from_dict(cls, d: dict) -> Token:
        """
        Initialize and return a new Token with values from a dictionary.

        :param d: A dictionary of properties for the Token
        """
        if not isinstance(d, collections.abc.Mapping):
            raise ValueError(f'Object is not dict-like: {d}')
        classname = d['Token type']
        #self.__class__.log.debug(f'from_dict: {d}')
        t = Token._subclasses[classname](
            json.loads(d['Token info']),
            d.get('Doc ID', None),
            d.get('Index', -1)
        )
        t.gold = d.get('Gold', None)
        t.is_hyphenated = d.get('Hyphenated', False)
        t.is_discarded = d.get('Discarded', False)
        kbest = collections.defaultdict(lambda: KBestItem(''))
        k = 1
        while f'{k}-best' in d:
            candidate = d[f'{k}-best']
            if candidate == '':
                break
            probability = d[f'{k}-best prob.']
            kbest[k] = KBestItem(candidate, float(probability))
            k += 1
        t.kbest = kbest
        if 'Bin' in d and d['Bin'] not in ('', '-1', -1):
            from ..heuristics import Heuristics
            t.bin = Heuristics.bin(int(d['Bin']))
            t.bin.heuristic = d['Heuristic']
            t.decision = d['Decision']
            t.selection = d['Selection']
        #t.__class__.log.debug(t)
        return t
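
# Illustrative sketch (not part of the original module): a concrete subclass is
# registered with ``@Token.register`` so that ``Token.from_dict`` can find it via
# the 'Token type' field. ``StringToken`` below is a hypothetical example.
#
#     @Token.register
#     class StringToken(Token):
#         @property
#         def token_info(self):
#             return self.original
#
#     t = StringToken('Exxample,', 'doc1', 0)
#     t.gold = 'Example'
#     t2 = Token.from_dict(t.__dict__)   # round-trip via the __dict__ property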
##########################################################################################
class Tokenizer(abc.ABC):
    """
    Abstract base class. The `Tokenizer` subclasses handle extracting :class:`Token` instances from a document.
    """
    log = logging.getLogger(f'{__name__}.Tokenizer')
    _subclasses = dict()

    @staticmethod
    def register(extensions: List[str]):
        """
        Decorator which registers a :class:`Tokenizer` subclass with the base class.

        :param extensions: List of extensions that the subclass will handle
        """
        def wrapper(cls):
            for ext in extensions:
                Tokenizer._subclasses[ext] = cls
            return cls
        return wrapper

    @staticmethod
    def for_extension(ext: str) -> TokenList.__class__:
        """
        Obtain the suitable subclass for the given extension. Currently, Tokenizers are provided for the following extensions:

        -  ``.txt`` -- plain old text.
        -  ``.pdf`` -- assumes the PDF contains images and OCRed text.
        -  ``.tiff`` -- will run OCR on the image and generate a PDF.
        -  ``.png`` -- will run OCR on the image and generate a PDF.

        :param ext: Filename extension (including leading period).
        :return: A Tokenizer subclass.
        """
        Tokenizer.log.debug(f'_subclasses: {Tokenizer._subclasses}')
        return Tokenizer._subclasses[ext]

    def __init__(self, language, dehyphenate):
        """
        :type language: :class:`pycountry.Language`
        :param language: The language to use for tokenization (for example, the `.txt` tokenizer internally uses nltk whose tokenizers function best with a language parameter).
        :param dehyphenate: Whether to attempt to rejoin tokens that were hyphenated across line breaks.
        """
        self.language = language
        self.dehyphenate = dehyphenate
        self.tokens = []

    @abc.abstractmethod
    def tokenize(self, file: Path, storageconfig) -> TokenList:
        """
        Generate tokens for the given document.

        :param storageconfig: Storage configuration (database, filesystem) for resulting Tokens
        :param file: A given document.
        :return: A TokenList of the resulting Tokens.
        """
        pass

    @staticmethod
    @abc.abstractmethod
    def apply(original: Path, tokens: TokenList, corrected: Path):
        pass

    @staticmethod
    @abc.abstractmethod
    def crop_tokens(original, config, tokens, edge_left=None, edge_right=None):
        pass
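
# Illustrative sketch (not part of the original module): a Tokenizer subclass
# registers the extensions it handles, and callers obtain it again via
# ``Tokenizer.for_extension``. ``PlainTextTokenizer`` and its method bodies are
# hypothetical.
#
#     @Tokenizer.register(['.txt'])
#     class PlainTextTokenizer(Tokenizer):
#         def tokenize(self, file, storageconfig):
#             ...
#
#         @staticmethod
#         def apply(original, tokens, corrected):
#             ...
#
#         @staticmethod
#         def crop_tokens(original, config, tokens, edge_left=None, edge_right=None):
#             ...
#
#     cls = Tokenizer.for_extension('.txt')   # returns PlainTextTokenizer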