Source code for textdirectory.crudespellchecker

# -*- coding: utf-8 -*-

"""Spellchecker module."""
import gzip
import os
import pickle
import re
from collections import Counter
from pathlib import Path

from bs4 import BeautifulSoup


class CrudeSpellChecker:
    """A very simple and crude spellchecker based on Peter Norvig's design.

    Words are lower-cased and looked up in a word-frequency table (the
    "language model"). An unknown word is replaced by the most frequent known
    word that is one or, failing that, two single-character edits away.

    Simple Language Models (pass the name without the ``.gz.lm`` suffix):

    * ``crudesc_lm_en``: English based on COCA (sample), OANC (written), BNC
    * ``crudesc_lm_ame``: American English based on COCA (sample) and OANC (written)
    * ``crudesc_lm_amehistorical``: American English based on COHA (sample)
    """

    def __init__(self, caching=True, language_model='crudesc_lm_en'):
        """Load the gzip-compressed, pickled frequency table of a language model.

        The model is read from ``data/language_models/<language_model>.gz.lm``
        next to this module.

        :param caching: caching of corrections
        :type caching: bool
        :param language_model: the name of the lm (without the ``.gz.lm`` suffix)
        :type language_model: str
        """
        self.caching = caching
        self.cache = {}
        self.language_model_name = language_model

        # Build the path from the module's own location. ``Path(...).parent``
        # is '.' for a bare relative ``__file__``, whereas gluing an empty
        # dirname onto '/data/...' would point at the filesystem root.
        model_path = (Path(__file__).parent / 'data' / 'language_models'
                      / f'{self.language_model_name}.gz.lm')

        # NOTE: unpickling can execute arbitrary code. Only model files from a
        # trusted source may ever be placed in the language_models directory.
        with gzip.open(model_path, 'rb') as lm:
            self.frequencies = pickle.load(lm)

    def p_word(self, word):
        """Return the relative frequency of a word in the language model.

        :param word: a word
        :type word: str
        :return: count of the word divided by the total count of all words;
                 ``0.0`` if the model holds no counts at all
        """
        total = sum(self.frequencies.values())
        if not total:
            # An empty model would otherwise raise ZeroDivisionError.
            return 0.0
        return self.frequencies.get(word, 0) / total

    def correction(self, word):
        """Return the most probable spelling correction for a word.

        The lookup is done on the lower-cased word; if the input started with
        an upper-case character, the result is capitalized again.

        :param word: a word
        :type word: str
        :return: most probable spelling correction for word
        """
        if not word:
            # Nothing to correct, and word[0] below would raise IndexError.
            return word

        # Preserve the initial capitalization; the model is lower-case only.
        starts_upper = word[0].isupper()
        lowered = word.lower()

        if lowered in self.cache:
            best = self.cache[lowered]
        else:
            # All candidates share the same normalizing total, so ranking by
            # the raw count is equivalent to ranking by p_word() but avoids
            # summing the whole model once per candidate.
            best = max(self.candidates(lowered),
                       key=lambda candidate: self.frequencies.get(candidate, 0))

            if self.caching:
                # Key the cache on the looked-up word. This branch is only
                # reached when that word has no cache entry yet.
                self.cache[lowered] = best

        return best.capitalize() if starts_upper else best

    def candidates(self, word):
        """Return the possible corrections for a word.

        The first non-empty option wins: the word itself if known, known words
        one edit away, known words two edits away, else the word unchanged.

        :param word: a word
        :type word: str
        :return: a set of known candidates, or ``[word]`` if none is known
        """
        return (self.known([word]) or self.known(self.edit_distance_1(word)) or
                self.known(self.edit_distance_2(word)) or [word])

    def known(self, words):
        """Filter words down to those present in the language model.

        :param words: an iterable of words
        :type words: iterable of str
        :return: the subset of words found in the dictionary of frequencies
        """
        return {w for w in words if w in self.frequencies}

    def edit_distance_1(self, word):
        """Generate every string one edit away from a word.

        An edit is a deletion, a transposition of two adjacent characters, a
        replacement or an insertion; only ``a``-``z`` are substituted/inserted.

        :param word: a word
        :type word: str
        :return: all edits one edit away from the word
        """
        letters = 'abcdefghijklmnopqrstuvwxyz'
        splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
        deletes = [L + R[1:] for L, R in splits if R]
        transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
        replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
        inserts = [L + c + R for L, R in splits for c in letters]

        return set(deletes + transposes + replaces + inserts)

    def edit_distance_2(self, word):
        """Lazily generate every string two edits away from a word.

        :param word: a word
        :type word: str
        :return: a generator over all edits two edits away from the word
        """
        return (e2 for e1 in self.edit_distance_1(word) for e2 in self.edit_distance_1(e1))

    def correct_string(self, string, return_corrections=False):
        """Correct every whitespace-separated token of a string.

        In each token the first run of word characters is corrected; everything
        around it (punctuation, and any further runs as in ``don't``) is kept
        as is. Tokens without any word character are passed through unchanged.
        Tokens are re-joined with single spaces.

        :param string: the string to correct.
        :type string: str
        :param return_corrections: include the corrections in the result
        :type return_corrections: bool
        :return: the corrected string, or a tuple of the corrected string and a
                 list of ``(original token, corrected token)`` pairs
        """
        corrections = []
        corrected = []
        for token in string.split():
            match = re.search(r'\w+', token)
            if match is None:
                # Pure punctuation such as '---': nothing to look up.
                corrected.append(token)
                continue

            # Splice the correction in by position instead of re.sub(), which
            # would overwrite every later word run with the same correction.
            corrected_token = (token[:match.start()]
                               + self.correction(match.group())
                               + token[match.end():])
            corrected.append(corrected_token)

            if return_corrections and corrected_token != token:
                corrections.append((token, corrected_token))

        if return_corrections:
            return (' '.join(corrected), corrections)
        return ' '.join(corrected)


def generate_crudespellchecker_lm(corpus_directory, model_name, strip_xml=False):
    """Build a word-frequency language model from the text files of a folder.

    Every ``*.txt`` file directly inside the folder is lower-cased, optionally
    stripped of XML tags, and split into runs of word characters that contain
    no digits. The summed counts are pickled as a ``Counter`` into the
    gzip-compressed file ``<model_name>.gz.lm``.

    :param corpus_directory: path to the folder containing the files.
    :type corpus_directory: str
    :param model_name: the name of the model; ``.gz.lm`` is appended to it
    :type model_name: str
    :param strip_xml: stripping XML tags with bs4
    :type strip_xml: bool
    """
    # Compiled once; the same pattern is applied to every file.
    token_pattern = re.compile(r'\b[^\d\W]+\b')
    frequencies = Counter()

    for corpus_file in Path(corpus_directory).glob('*.txt'):
        # Undecodable bytes are dropped rather than aborting the whole run.
        with open(corpus_file, 'r', errors='ignore') as handle:
            text = handle.read().lower()

        if strip_xml:
            text = BeautifulSoup(text, 'lxml').get_text()

        # update() adds in place; ``frequencies + ...`` would copy the entire
        # accumulated table once per file.
        frequencies.update(token_pattern.findall(text))

    with gzip.open(model_name + '.gz.lm', 'wb') as pkl:
        pickle.dump(frequencies, pkl)
Source code for textdirectory.crudespellchecker

textdirectory

Navigation

Related Topics