# Source code for CorrectOCR.dictionary

import logging
from pathlib import Path
from typing import Optional, Set

from .fileio import FileIO

class Dictionary(Set[str]):
    """
    Set of words to use for determining correctness of :class:`Tokens<CorrectOCR.tokens.Token>` and suggestions.

    Words are normalised on insertion (see :meth:`add`): surrounding whitespace
    is stripped, non-alphabetic strings are dropped, and everything is stored
    in lowercase when ``ignoreCase`` is set. Purely numeric strings are always
    reported as members (see :meth:`__contains__`).
    """
    log = logging.getLogger(f'{__name__}.Dictionary')

    def __init__(self, path: Optional[Path] = None, ignoreCase: bool = False):
        """
        :param path: A path for loading a previously saved dictionary. It is
            only read if it points to an existing file, but it is remembered
            either way as the default target for :meth:`save` and :meth:`clear`.
        :param ignoreCase: Whether to ignore case. If set, words are stored
            and looked up in lowercase.
        """
        super().__init__()
        self.ignoreCase = ignoreCase
        self._path = path
        if self._path and self._path.is_file():
            Dictionary.log.info('Loading dictionary from %s', self._path)
            # One word per line. add() strips whitespace, drops blank or
            # non-alphabetic lines and lower-cases when ignoreCase is set,
            # so no separate case handling is needed here.
            for line in FileIO.load(self._path).splitlines():
                self.add(line, nowarn=True)
        Dictionary.log.info('%d words in dictionary', len(self))

    def __repr__(self) -> str:
        suffix = ' ignoreCase' if self.ignoreCase else ''
        return f'<{self.__class__.__name__} {len(self)}{suffix}>'

    def __contains__(self, word: str) -> bool:
        """
        Test whether a word counts as known.

        :param word: The word to look up.
        :return: ``True`` for purely numeric strings and for words present in
            the set (compared in lowercase if ``ignoreCase`` is set).
        """
        if not isinstance(word, str):
            # Only strings are ever stored, so anything else cannot be a member.
            return False
        if word.isnumeric():
            # Numbers are always treated as correct; they are never stored.
            return True
        if self.ignoreCase:
            word = word.lower()
        return super().__contains__(word)

    def clear(self) -> None:
        """
        Remove all words from the dictionary.

        If the dictionary has a path, it is first passed to
        ``FileIO.ensure_new_file``. A dictionary without a path is only
        cleared in memory.
        """
        Dictionary.log.info('Clearing dictionary at %s.', self._path)
        if self._path is not None:
            FileIO.ensure_new_file(self._path)
        super().clear()

    def add(self, word: str, nowarn: bool = False) -> None:
        """
        Add a new word to the dictionary. Silently drops non-alpha strings.

        :param word: The word to add. Surrounding whitespace is stripped.
        :param nowarn: Don't warn about long words (>15 letters).
        """
        word = word.strip()
        # str.isalpha() is False for the empty string, so this one test
        # rejects both blank and non-alphabetic input.
        if not word.isalpha() or word in self:
            return
        if len(word) > 15 and not nowarn:
            # Warn with the word as given, before any lower-casing.
            Dictionary.log.warning('Added word is more than 15 characters long: %s', word)
        if self.ignoreCase:
            word = word.lower()
        super().add(word)

    def save(self, path: Optional[Path] = None) -> None:
        """
        Save the dictionary, one word per line, sorted case-insensitively.

        :param path: Optional new path to save to. Defaults to the path given
            at construction.
        :raises ValueError: If neither ``path`` nor a construction-time path
            is available.
        """
        path = path or self._path
        if not path:
            raise ValueError('No path given to save the dictionary to.')
        Dictionary.log.info('Saving dictionary (words: %d) to %s', len(self), path)
        # NOTE(review): this call was reconstructed from a truncated line of
        # the original; confirm that FileIO.save takes (data, path).
        FileIO.save('\n'.join(sorted(self, key=str.lower)), path)