Source code for textdirectory.crudespellchecker

# -*- coding: utf-8 -*-

"""Spellchecker module."""
import gzip
import os
import pickle
import re
from collections import Counter
from pathlib import Path

from bs4 import BeautifulSoup


class CrudeSpellChecker:
    """A very simple and crude spellchecker based on Peter Norvig's design.

    The language model is a pickled word-frequency mapping loaded from
    ``data/language_models/<language_model>.gz.lm`` next to this module.

    Simple Language Models:

    * crudesc_lm_en.gz.lm -- English based on COCA (sample), OANC (written), BNC
    * crudesc_lm_ame.lm -- American English based on COCA (sample) and OANC (written)
    * crudesc_lm_amehistorical.lm -- American English based on COHA (sample)
    """

    def __init__(self, caching=True, language_model='crudesc_lm_en'):
        """
        :param caching: caching of corrections
        :type caching: bool
        :param language_model: the name of the lm
        :type language_model: str
        """
        self.caching = caching
        self.cache = {}
        self.language_model_name = language_model

        # Resolve the model relative to this module; pathlib also copes with
        # an empty dirname, where string concatenation produced '/data/...'.
        model_path = (Path(__file__).parent / 'data' / 'language_models'
                      / f'{self.language_model_name}.gz.lm')

        with gzip.open(model_path, 'rb') as lm:
            # NOTE: pickle can execute arbitrary code while loading. Only
            # load language models that come from a trusted source.
            self.frequencies = pickle.load(lm)

        # Sum once so that p_word does not re-scan the table on every call.
        self._frequency_total = sum(self.frequencies.values())

    def _total_frequency(self):
        """Return the cached sum of all word counts, computing it on first use.

        The lazy fallback covers instances created without ``__init__``.
        The value is not refreshed if ``frequencies`` is mutated afterwards.
        """
        total = getattr(self, '_frequency_total', None)
        if total is None:
            total = sum(self.frequencies.values())
            self._frequency_total = total
        return total

    def p_word(self, word):
        """
        :param word: a word
        :type word: str
        :return: relative frequency of the word in the language model;
                 0 for unknown words or an empty model
        """
        total = self._total_frequency()
        if not total:
            return 0.0
        return self.frequencies.get(word, 0) / total

    def correction(self, word):
        """
        :param word: a word
        :type word: str
        :return: most probable spelling correction for word; the lookup is
                 done in lowercase and an initial capital is restored
        """
        if not word:
            # Nothing to correct, and word[0] below would raise IndexError.
            return word

        # Preserve the initial capitalization only.
        word_isupper = word[0].isupper()
        lowered = word.lower()

        if lowered in self.cache:
            best = self.cache[lowered]
        else:
            # Sorting makes ties deterministic; set iteration order varies
            # between interpreter runs because of string hash randomization.
            best = max(sorted(self.candidates(lowered)), key=self.p_word)
            if self.caching:
                # Key the cache on the looked-up word, not on the correction.
                self.cache[lowered] = best

        return best.capitalize() if word_isupper else best

    def candidates(self, word):
        """
        :param word: a word
        :type word: str
        :return: the known words at the smallest edit distance (0, 1 or 2)
                 as a set, or ``[word]`` if none are known
        """
        return (self.known([word])
                or self.known(self.edit_distance_1(word))
                or self.known(self.edit_distance_2(word))
                or [word])

    def known(self, words):
        """
        :param words: an iterable of words
        :type words: iterable of str
        :return: a subset of words in the dictionary of frequencies
        """
        return {w for w in words if w in self.frequencies}

    def edit_distance_1(self, word):
        """
        :param word: a word
        :type word: str
        :return: all edits one edit away from the word
        """
        letters = 'abcdefghijklmnopqrstuvwxyz'
        splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
        deletes = [L + R[1:] for L, R in splits if R]
        transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
        replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
        inserts = [L + c + R for L, R in splits for c in letters]
        return set(deletes + transposes + replaces + inserts)

    def edit_distance_2(self, word):
        """
        :param word: a word
        :type word: str
        :return: a generator over all edits two edits away from the word
        """
        return (e2
                for e1 in self.edit_distance_1(word)
                for e2 in self.edit_distance_1(e1))

    def correct_string(self, string, return_corrections=False):
        """
        The string is split on whitespace and re-joined with single spaces.
        In each token only the first run of word characters is corrected;
        everything else in the token is kept verbatim.

        :param string: the string to correct.
        :type string: str
        :param return_corrections: include the corrections in the result
        :type return_corrections: bool
        :return: the corrected string, or a tuple of the corrected string and
                 a list of ``(original, corrected)`` tokens
        """
        corrections = []
        corrected = []

        for token in string.split():
            # A callable replacement with count=1 leaves tokens without word
            # characters untouched, never rewrites later word runs, and is
            # not subject to escape processing in the replacement text.
            corrected_token = re.sub(
                r'\w+',
                lambda match: self.correction(match.group()),
                token,
                count=1)
            corrected.append(corrected_token)

            if return_corrections and corrected_token != token:
                corrections.append((token, corrected_token))

        result = ' '.join(corrected)
        if return_corrections:
            return (result, corrections)
        return result
def generate_crudespellchecker_lm(corpus_directory, model_name, strip_xml=False):
    """Build a word-frequency language model from the ``*.txt`` files of a folder.

    Every file is lowercased, optionally stripped of XML tags, and tokenized
    into runs of word characters that are not digits. The resulting
    ``Counter`` is pickled into the gzip file ``<model_name>.gz.lm``.

    :param corpus_directory: path the folder containing the files.
    :type corpus_directory: str
    :param model_name: the name of the model; used as the output path
                       without the ``.gz.lm`` suffix
    :type model_name: str
    :param strip_xml: stripping XML tags with bs4
    :type strip_xml: bool
    """
    # Hoisted out of the loop: the pattern is the same for every file.
    find_words = re.compile(r'\b[^\d\W]+\b').findall
    frequencies = Counter()

    for corpus_file in Path(corpus_directory).glob('*.txt'):
        # Undecodable bytes are dropped rather than aborting the whole run.
        with open(corpus_file, 'r', errors='ignore') as handle:
            text = handle.read().lower()

        if strip_xml:
            text = BeautifulSoup(text, 'lxml').get_text()

        # In-place update; `frequencies + counter` would copy the entire
        # accumulated table once per file.
        frequencies.update(find_words(text))

    with gzip.open(model_name + '.gz.lm', 'wb') as pkl:
        pickle.dump(frequencies, pkl)