Source code for textdirectory.helpers

# -*- coding: utf-8 -*-

"""Helpers module."""
import copy
import re

import psutil

from textdirectory import textdirectory, transformations


def tabulate_flat_list_of_dicts(list_of_dicts, max_length=25):
    """Render a list of flat dictionaries as a plain-text table.

    The header is built from the keys of the first dictionary. Every cell is
    converted with ``str`` (so values of any type can be tabulated) and line
    breaks are removed from the cell values. The input is never modified.

    :param list_of_dicts: a list of dictionaries; each dictionary is a row
    :type list_of_dicts: list
    :param max_length: the maximum length of a cell; a falsy value disables
        truncation
    :type max_length: int
    :return: the table, or ``False`` if ``list_of_dicts`` is empty
    :type return: str
    """
    if not list_of_dicts:
        return False

    def _cell(value):
        # Always stringify, even without truncation, so that non-string
        # values can be measured, cleaned and padded further down.
        text = str(value)
        return text[:max_length] if max_length else text

    # Build fresh rows of strings instead of mutating the caller's objects.
    rows = [
        {key: _cell(value) for key, value in row.items()}
        for row in list_of_dicts
    ]
    header_keys = list(rows[0])

    # Determine the width of the columns: at least as wide as the header,
    # widened by the longest cell (measured before line breaks are removed).
    widths = {key: len(str(key)) for key in header_keys}
    for row in rows:
        for key, text in row.items():
            widths[key] = max(widths.get(key, 0), len(text))

    # Separator line; len(widths) - 1 = additional characters for the pipes
    # between the columns.
    line = '\n|' + '-' * (sum(widths.values()) + len(widths) - 1) + '|'

    # Header based on the first dictionary
    parts = [line, '\n|']
    parts.extend(f'{key}'.ljust(widths[key]) + '|' for key in header_keys)
    parts.append(line)

    # Rows
    for row in rows:
        parts.append('\n|')
        parts.extend(
            # Remove linebreaks so that a cell never spans several lines
            text.replace('\n', '').ljust(widths[key]) + '|'
            for key, text in row.items()
        )
    parts.append(line)

    return ''.join(parts)
def count_non_alphanum(string):
    """Count the characters of a string that are not alphanumeric.

    A character counts as alphanumeric if ``str.isalnum`` is true for it,
    i.e. letters and digits; whitespace and punctuation are counted.

    :param string: a string
    :type string: str
    :return: the number of non-alphanumeric characters in the string
    :type return: int
    """
    # isalnum (not isalpha): digits are alphanumeric and must not be counted.
    return sum(1 for c in string if not c.isalnum())
def chunk_text(string, chunk_size=50000):
    """Split a string into consecutive pieces of at most ``chunk_size`` characters.

    :param string: a string
    :type string: str
    :param chunk_size: the max characters of one chunk
    :type chunk_size: int
    :return: a list of chunks
    :type return: list
    """
    pieces = []
    for start in range(0, len(string), chunk_size):
        stop = start + chunk_size
        # The last piece may be shorter; slicing past the end is safe.
        pieces.append(string[start:stop])
    return pieces
def estimate_spacy_max_length(override=False, tokenizer_only=False):
    """Return a somewhat sensible suggestion for max_length.

    The estimate is derived from the currently available memory as reported
    by ``psutil.virtual_memory()``, using the rule of thumb of 100,000
    characters per GB; it is tripled if ``tokenizer_only`` is set.

    :param override: if truthy, this value is returned unchanged and no
        estimate is computed
    :param tokenizer_only: if True, triple the estimate
    :type tokenizer_only: bool
    :return: ``override`` if it is truthy, otherwise the estimate (a float)
    """
    # An explicit override wins; there is no need to query system memory.
    if override:
        return override

    memory = psutil.virtual_memory()
    gb_available = memory.available / 1024 / 1024 / 1024

    # tagger, parser, ner: 100,000 characters = 1 GB
    estimated_max_length = gb_available * 100000

    if tokenizer_only:
        estimated_max_length *= 3

    return estimated_max_length
def get_human_from_docstring(doc):
    """Extract the ``human_*`` fields from a docstring.

    All space characters are removed from ``doc`` before matching, so
    neither the keys nor the values of the result contain spaces.

    :param doc: the docstring to parse; ``None`` or an empty string yields
        an empty dictionary
    :type doc: str
    :return: a dictionary mapping the part of each field name that follows
        ``human_`` to the field's value
    :type return: dict
    """
    # Objects without a docstring have __doc__ set to None.
    if not doc:
        return {}

    compact = doc.replace(' ', '')
    fields = re.findall(r'human_(.*):(.*)', compact)
    return {key: value.strip() for key, value in fields}
def get_available_filters(get_human_name=False):
    """List the names of the filters available on ``TextDirectory``.

    A filter is any attribute of ``textdirectory.TextDirectory`` whose name
    contains ``filter_by``.

    :param get_human_name: if True, also return the 'human name'
    :type get_human_name: bool
    :return: a list of filter names; if get_human_name a list of
        ``(name, human name)`` tuples, where the human name falls back to
        the filter name if the docstring does not provide one
    :type return: list
    """
    filter_names = [name for name in dir(textdirectory.TextDirectory)
                    if 'filter_by' in name]

    if not get_human_name:
        return filter_names

    filters_with_human = []
    for name in filter_names:
        doc = getattr(textdirectory.TextDirectory, name).__doc__
        # A filter without a docstring has no human name to parse.
        human = get_human_from_docstring(doc) if doc else {}
        filters_with_human.append((name, human.get('name', name)))

    return filters_with_human
def get_available_transformations(get_human_name=False):
    """List the names of the transformations in the ``transformations`` module.

    A transformation is any attribute of the ``transformations`` module whose
    name contains ``transformation``.

    :param get_human_name: if True, also return the 'human name'
    :type get_human_name: bool
    :return: a list of transformation names; if get_human_name a list of
        ``(name, human name)`` tuples, where the human name falls back to
        the transformation name if the docstring does not provide one
    :type return: list
    """
    transformation_names = [name for name in dir(transformations)
                            if 'transformation' in name]

    if not get_human_name:
        return transformation_names

    transformations_with_human = []
    for name in transformation_names:
        # Resolve the name on the same module that dir() enumerated above.
        doc = getattr(transformations, name).__doc__
        # A transformation without a docstring has no human name to parse.
        human = get_human_from_docstring(doc) if doc else {}
        transformations_with_human.append((name, human.get('name', name)))

    return transformations_with_human