Source code for pagexml.analysis.stats

from typing import Dict, List, Union

import numpy as np

import pagexml.analysis.layout_stats as layout_stats
import pagexml.analysis.text_stats as text_stats
import pagexml.model.physical_document_model as pdm


[docs] def derive_boundary_points(pagexml_doc: pdm.PageXMLTextRegion) -> List[int]: bin_width = pagexml_doc.coords.width / 5 return [point for point in np.arange(bin_width, pagexml_doc.coords.width, bin_width)]
def _init_doc_stats(line_width_boundary_points: List[int], word_length_bin_size: int = 5, max_word_length: int = 30) -> Dict[str, List[any]]: fields = ['doc_id', 'doc_num', 'doc_width', 'doc_height', 'lines', 'words', 'text_regions', 'columns', 'extra', 'pages', 'num_words', 'num_number_words', 'num_title_words', 'num_non_title_words', 'num_stop_words', 'num_punctuation_words', 'num_oversized_words'] doc_stats = {field: [] for field in fields} for cat_wpl in text_stats.wpl_cat_range: doc_stats[f"words_per_line_{text_stats.wpl_cat_range[cat_wpl]}"] = [] for length_bin in range(word_length_bin_size, max_word_length + 1, word_length_bin_size): doc_stats[f"num_words_length_{length_bin}"] = [] for width_range in layout_stats.get_boundary_width_ranges(line_width_boundary_points): doc_stats[f"line_width_range_{width_range}"] = [] return doc_stats
[docs] def get_doc_stats(pagexml_docs: Union[pdm.PageXMLTextRegion, List[pdm.PageXMLTextRegion]], line_width_boundary_points: List[int] = None, stop_words: List[str] = None, max_word_length: int = 30, doc_num: int = None, use_re_word_boundaries: bool = False, line_bin_width: int = 300, max_bin: int = 3000) -> Dict[str, List[any]]: """Generate basic statistics for a PageXML scan object (number of text regions, lines, words, etc.). Line widths are categorised based on a list of boundary points that determine the width of each bin. If no boundary points are passed, a set of boundary points is generated based on the width of the pagexml_doc. :param pagexml_docs: a PageXML document object or a list of PageXML document objects :type pagexml_docs: PageXMLTextRegion :param line_width_boundary_points: a list of points indicating boundaries between categories of line widths :type line_width_boundary_points: List[int] :param stop_words: a list of stopwords to include in number of stopwords the scan statistics :type stop_words: List[str], :param max_word_length: max word length above which words are considered oversized :type max_word_length: int :param doc_num: the number of a doc in a sequence of docs :type doc_num: int :param use_re_word_boundaries: flag whether to use RegEx word boundaries for word count :type use_re_word_boundaries: bool :param line_bin_width: width of line bins, to aggregate lines of different lengths :type line_bin_width: int :param max_bin: max line width bin :type max_bin: int :return: a dictionary with scan statistics :rtype: Dict[str, int] """ if line_width_boundary_points is None: line_width_boundary_points = [point for point in range(line_bin_width, max_bin, line_bin_width)] pagexml_doc_stats = _init_doc_stats(line_width_boundary_points, max_word_length=max_word_length) if isinstance(pagexml_docs, pdm.PageXMLTextRegion): pagexml_docs = [pagexml_docs] for pi, pagexml_doc in enumerate(pagexml_docs): pagexml_doc_stats['doc_id'].append(pagexml_doc.id) pagexml_doc_stats['doc_num'].append(pi + 1) pagexml_doc_stats['doc_width'].append(pagexml_doc.coords.width) pagexml_doc_stats['doc_height'].append(pagexml_doc.coords.height) lines = [line for line in pagexml_doc.get_lines() if line.text is not None] words = text_stats.get_doc_words(pagexml_doc, use_re_word_boundaries=use_re_word_boundaries) word_stats = text_stats.get_word_cat_stats(words, stop_words=stop_words, max_word_length=max_word_length) wpl_stats = text_stats.get_words_per_line(lines) for field in pagexml_doc.stats: pagexml_doc_stats[field].append(pagexml_doc.stats[field]) for word_cat in word_stats: pagexml_doc_stats[word_cat].append((word_stats[word_cat])) for wpl_cat in text_stats.wpl_cat_range.values(): pagexml_doc_stats[f'words_per_line_{wpl_cat}'].append(wpl_stats[wpl_cat]) if line_width_boundary_points is None: bin_width = pagexml_doc.coords.width / 5 line_width_boundary_points = [point for point in np.arange(bin_width, pagexml_doc.coords.width, bin_width)] line_width_stats = layout_stats.get_line_width_stats(lines, line_width_boundary_points) for line_width_range in line_width_stats: pagexml_doc_stats[f'line_width_range_{line_width_range}'].append(line_width_stats[line_width_range]) return pagexml_doc_stats