Source code for CorrectOCR.dictionary

import logging
import itertools
import string
from collections import defaultdict
from pathlib import Path
from typing import Set

import progressbar

from ._util import letterRE
from .fileio import FileIO


class Dictionary(Set[str]):
    """
    Set of words to use for determining correctness of
    :class:`Tokens<CorrectOCR.tokens.Token>` and suggestions.

    Words are stored per named group in :attr:`groups`; the length of the
    dictionary is the number of distinct words across all groups.

    **Note**: A Dictionary "contains" all "words" that contain at most one
    alphabetic letter, such as ``8,5`` or ``(600)`` or ``A4``.
    """
    log = logging.getLogger(f'{__name__}.Dictionary')

    def __init__(self, path: Path = None, ignoreCase: bool = False):
        """
        :param path: A path for loading a previously saved dictionary.
        :param ignoreCase: Whether the dictionary is case sensitive.
        """
        super().__init__()
        self.ignoreCase = ignoreCase
        self._path = path
        # Maps group name -> set of words belonging to that group.
        self.groups = defaultdict(set)
        # Names of groups modified since loading; only these are written by save().
        self._dirty = set()
        if self._path:
            if not self._path.is_dir():
                FileIO.ensure_directories(self._path)
            else:
                Dictionary.log.info(f'Loading dictionary from {self._path}')
                # Each entry in the directory is one group: the file stem is
                # the group name and every line is a word.
                for file in progressbar.progressbar(self._path.iterdir()):
                    for line in FileIO.load(file).split('\n'):
                        self.add(file.stem, line, nowarn=True, dirty=False)
        Dictionary.log.info(f'{len(self)} words in dictionary')

    def __repr__(self) -> str:
        # The original format string opened a double quote that was never
        # closed; the stray quote is dropped here.
        suffix = ' ignoreCase' if self.ignoreCase else ''
        return f'<{self.__class__.__name__} {len(self)}{suffix}>'

    def __len__(self) -> int:
        """Return the number of distinct words across all groups."""
        return len(set().union(*self.groups.values()))

    def __contains__(self, word: str) -> bool:
        """
        Check whether the cleaned ``word`` is known.

        Anything that is empty after cleaning, or that has at most one
        ``letterRE`` match, is always considered contained (see class note).
        """
        word = self.clean(word)
        if word == '' or len(letterRE.findall(word)) <= 1:
            return True
        if self.ignoreCase:
            word = word.lower()
        return any(word in group for group in self.groups.values())

    def has_group(self, group: str) -> bool:
        """
        :param group: Name of the group to look for.
        :return: Whether a group with that name exists.
        """
        return group in self.groups

    def clear(self):
        """Reset the backing path via ``FileIO.ensure_new_file`` and drop all groups."""
        Dictionary.log.info(f'Clearing dictionary at {self._path}.')
        FileIO.ensure_new_file(self._path)  # TODO
        self.groups = defaultdict(set)

    def add(self, group: str, word: str, nowarn: bool = False, dirty: bool = True):
        """
        Add a new word (sans punctuation) to the dictionary. Silently drops non-alpha strings.

        Strings containing spaces are split, and each part not already in
        the dictionary is added separately.

        :param group: Name of the group to add the word to.
        :param word: The word to add.
        :param nowarn: Don't warn about long words (>20 letters).
        :param dirty: Whether to mark the group as modified (so that
            :meth:`save` writes it) when the word is new to the group.
        """
        word = self.clean(word)
        if word == '' or not letterRE.search(word):
            return
        if ' ' in word:
            Dictionary.log.info(f'Splitting word with spaces: {word}')
            for w in word.split(' '):
                if w not in self:
                    self.add(group, w, nowarn, dirty)
            return
        if self.ignoreCase:
            word = word.lower()
        if dirty and word not in self.groups[group]:
            self._dirty.add(group)
        if len(word) > 20 and not nowarn:
            Dictionary.log.warning(f'Added word is more than 20 characters long: {word}')
        return self.groups[group].add(word)

    def save_group(self, group: str):
        """
        Write a single group to its file below the dictionary path.

        An empty group has its file deleted instead of being written.

        :param group: Name of the group to save.
        """
        path = self._path.joinpath(group)
        if not self.groups[group]:
            FileIO.delete(path)
        else:
            Dictionary.log.info(f'Saving group (words: {len(self.groups[group])}) to {path}')
            FileIO.save('\n'.join(sorted(self.groups[group], key=str.lower)), path, backup=False)

    def save(self, path: Path = None):
        """
        Save the dictionary.

        Only groups marked as modified are written.

        :param path: Optional new path to save to.
        """
        if path:
            self._path = path
        Dictionary.log.info(f'Saving dictionary (total words: {len(self)})')
        for group in self.groups:
            if group in self._dirty:
                self.save_group(group)

    def clean(self, word: str) -> str:
        """
        Normalize a word before lookup or insertion.

        :param word: The raw string.
        :return: The string without hyphens and without surrounding
            punctuation, whitespace or quotation marks.
        """
        word = word.replace('\xad', '')  # remove soft hyphens
        word = word.replace('-', '')  # remove hard hyphens
        # strip surrounding punctuation and quotation marks
        word = word.strip(string.punctuation + string.whitespace + '»«“”„›‹')
        return word