import csv
import json
import logging
import pickle
import shutil
from pathlib import Path
from typing import Any, List
from bs4.dammit import UnicodeDammit
def _open_for_reading(file: Path, binary: bool = False):
    """
    Open a file for reading.

    :param file: The path to open.
    :param binary: If true, open in binary mode (``'rb'``). Otherwise open in
        text mode, using the encoding reported by :meth:`FileIO.get_encoding`.
    :return: The open file object. The caller is responsible for closing it
        (e.g. by using it in a ``with`` statement).
    """
    if binary:
        return open(str(file), 'rb')
    # FileIO is defined further down in this module; the name is only
    # resolved when this function is called, not when it is defined.
    return open(str(file), 'r', encoding=FileIO.get_encoding(file))
##########################################################################################
class FileIO(object):
    """
    Various file IO helper methods.

    All helpers are classmethods, so the class is used as a namespace.
    :attr:`cacheRoot` must be assigned a :class:`pathlib.Path` before
    :meth:`cachePath` or :meth:`imageCache` is called.
    """
    log = logging.getLogger(f'{__name__}.FileIO')
    # Root directory for cached files; stays None until the application sets it.
    cacheRoot = None

    @classmethod
    def cachePath(cls, name: str = '') -> Path:
        """
        Get a path below the cache root, creating directories as needed.

        The path is passed to :meth:`ensure_directories`, so unless it already
        exists as a file it is created as a directory.

        :param name: Path relative to :attr:`cacheRoot`.
        :return: The resulting path.
        :raises ValueError: If :attr:`cacheRoot` has not been set.
        """
        if cls.cacheRoot is None:
            raise ValueError('FileIO.cacheRoot must be set before using FileIO.cachePath()!')
        path = cls.cacheRoot.joinpath(name)
        cls.ensure_directories(path)
        return path

    @classmethod
    def imageCache(cls, name: str = None) -> Path:
        """
        Get a path below the ``images`` folder of the cache.

        :param name: Optional name below ``images``. If empty or ``None``,
            the ``images`` folder itself is returned.
        :return: The resulting path (see :meth:`cachePath`).
        """
        return cls.cachePath(f'images/{name}' if name else 'images')

    @classmethod
    def _csv_header(cls, k: int) -> List[str]:
        """
        Build the CSV column names used when saving token lists.

        :param k: Number of ``n-best`` candidate/probability column pairs.
        :return: The list of column names.
        """
        header = ['Gold', 'Original', 'Hyphenated', 'Discarded']
        # One (candidate, probability) pair of columns per rank 1..k.
        for n in range(1, k + 1):
            header += [f'{n}-best', f'{n}-best prob.']
        header += ['Bin', 'Heuristic', 'Selection']
        header += ['Token type', 'Token info', 'Doc ID', 'Index']
        header += ['Annotation info', 'Has error']
        cls.log.debug(f'header for k={k}: {header}')
        return header

    @classmethod
    def get_encoding(cls, file: Path) -> str:
        """
        Get encoding of a text file.

        Only the first ``1024*500`` bytes of the file are read and handed to
        :class:`bs4.dammit.UnicodeDammit`, together with ``'utf-8'`` and
        ``'Windows-1252'`` as encodings to try.

        :param file: A path to a text file.
        :return: The encoding of the file, eg. 'utf-8', 'Windows-1252', etc.
        """
        with open(str(file), 'rb') as f:
            dammit = UnicodeDammit(f.read(1024 * 500), ['utf-8', 'Windows-1252'])
        return dammit.original_encoding

    @classmethod
    def ensure_new_file(cls, path: Path):
        """
        Moves a possible existing file out of the way by adding a numeric counter before the extension.

        For example an existing ``data.txt`` is renamed to ``data.000.txt``, or
        to ``data.001.txt`` if that name is already taken, and so on. Nothing
        happens if ``path`` is not an existing file.

        :param path: The path to check.
        """
        counter = 0
        candidate = path
        # Walk through the numbered names until one is free.
        while candidate.is_file():
            candidate = Path(
                path.parent,
                f'{path.stem}.{counter:03n}{path.suffix}'
            )
            counter += 1
        if counter > 0:
            path.rename(candidate)
            cls.log.info(f'Existing file moved to {candidate}')

    @classmethod
    def ensure_directories(cls, path: Path):
        """
        Ensures that the entire path exists.

        If ``path`` is an existing file, its parent directory is ensured
        instead. Any other path is itself created as a directory.

        :param path: The path to check.
        """
        if path.is_file():
            path = path.parent
        path.mkdir(parents=True, exist_ok=True)

    @classmethod
    def copy(cls, src: Path, dest: Path):
        """
        Copies a file.

        :param src: Source-path.
        :param dest: Destination-path.
        """
        cls.log.info(f'Copying {src} to {dest}')
        shutil.copy(str(src), str(dest))

    @classmethod
    def delete(cls, path: Path):
        """
        Deletes a file.

        Does nothing if the path does not exist.

        :param path: The path to delete.
        """
        if path.exists():
            path.unlink()

    @classmethod
    def save(cls, data: Any, path: Path, backup=True):
        """
        Saves data into a file. The extension determines the method of saving:

        - `.pickle` -- uses :mod:`pickle`.
        - `.json` -- uses :mod:`json`.
        - `.csv` -- uses :class:`csv.DictWriter` (assumes data is list of :func:`vars()`-capable
          objects). The keys of the first object determines the header. If data
          is empty, a warning is logged and an empty file is left behind.

        Any other extension will simply :func:`write()` the data to the file.

        Text formats are written as UTF-8.

        :param data: The data to save.
        :param path: The path to save to.
        :param backup: Whether to move existing files out of the way via :meth:`ensure_new_file`
        """
        suffix = path.suffix

        # Project-local helpers are imported lazily and only for the formats
        # that need them, but still before any file is moved or truncated so
        # an import problem cannot clobber existing data.
        if suffix == '.json':
            from ._codecs import COCRJSONCodec
        elif suffix == '.csv':
            from .tokens.list import TokenList

        if backup:
            cls.ensure_new_file(path)

        if suffix == '.pickle':
            handle = open(str(path), 'wb')
        else:
            # The csv module requires newline='' so that it controls line
            # endings itself (otherwise Windows output gets '\r\r\n').
            handle = open(
                str(path), 'w', encoding='utf-8',
                newline='' if suffix == '.csv' else None,
            )

        with handle as f:
            if suffix == '.pickle':
                pickle.dump(data, f)
            elif suffix == '.json':
                json.dump(data, f, cls=COCRJSONCodec)
            elif suffix == '.csv':
                if len(data) == 0:
                    # Without a first row there is nothing to derive the
                    # header from; leave the (empty) file in place.
                    cls.log.warning(f'No rows to save to {path}')
                    return
                if isinstance(data, TokenList):
                    header = cls._csv_header(data[0].k)
                    rows = [vars(x) for x in data]
                else:
                    header = list(data[0].keys())
                    rows = data
                # Keys that are not part of the header are silently dropped.
                writer = csv.DictWriter(f, header, delimiter='\t', extrasaction='ignore')
                writer.writeheader()
                writer.writerows(rows)
            else:
                f.write(data)

    @classmethod
    def load(cls, path: Path, default=None):
        """
        Loads data from a file. The extension determines the method of loading:

        - `.pickle` -- uses :mod:`pickle`.
        - `.json` -- uses :mod:`json`.
        - `.csv` -- uses :class:`csv.DictReader`.

        Any other extension will simply :func:`read()` the data from the file.

        Text formats are decoded with the encoding detected by :meth:`get_encoding`.

        :param path: The path to load from.
        :param default: If file doesn't exist, return default instead.
        :return: The data from the file, or the default.
        """
        if not path.is_file():
            return default

        suffix = path.suffix
        if suffix == '.pickle':
            # NOTE: unpickling can execute arbitrary code; only load pickle
            # files that come from a trusted source.
            with _open_for_reading(path, binary=True) as f:
                return pickle.load(f)
        if suffix == '.csv':
            # newline='' lets the csv module handle line endings itself, as
            # required for quoted fields that contain newlines.
            with open(str(path), 'r', encoding=cls.get_encoding(path), newline='') as f:
                return list(csv.DictReader(f, delimiter='\t'))
        if suffix == '.json':
            from ._codecs import COCRJSONCodec
            with _open_for_reading(path) as f:
                return json.load(f, object_hook=COCRJSONCodec.object_hook)
        with _open_for_reading(path) as f:
            return f.read()