# Source code for pagexml.analysis.stats

from typing import Any, Dict, List, Union

import numpy as np

import pagexml.analysis.layout_stats as layout_stats
import pagexml.analysis.text_stats as text_stats
import pagexml.model.physical_document_model as pdm



# [docs]
def derive_boundary_points(pagexml_doc: pdm.PageXMLTextRegion) -> List[float]:
    """Derive line-width boundary points that split a document's width into five equal bins.

    :param pagexml_doc: a PageXML document object; only its ``coords.width`` is read
    :type pagexml_doc: PageXMLTextRegion
    :return: the four interior boundary points (at 1/5, 2/5, 3/5 and 4/5 of the document
        width), or an empty list when the document width is not positive
    :rtype: List[float]
    """
    num_bins = 5
    doc_width = pagexml_doc.coords.width
    if not doc_width > 0:
        # Without a positive width there are no bins to separate, and a zero step
        # cannot be iterated over.
        return []
    # Compute each boundary from its index rather than stepping with a float increment
    # (np.arange with a non-integer step): the number of points is then always
    # num_bins - 1 and cannot change due to floating-point rounding of the step.
    return [doc_width * i / num_bins for i in range(1, num_bins)]



def _init_doc_stats(line_width_boundary_points: List[int],
                    word_length_bin_size: int = 5, max_word_length: int = 30) -> Dict[str, List[Any]]:
    """Create the empty statistics table that is filled per document by get_doc_stats.

    Every key maps to an empty list. The keys are a fixed set of document and word fields,
    plus one key per words-per-line category (``words_per_line_<category>``), one per
    word-length bin (``num_words_length_<bin>``) and one per line-width range
    (``line_width_range_<range>``).

    :param line_width_boundary_points: a list of points indicating boundaries between
        categories of line widths
    :type line_width_boundary_points: List[int]
    :param word_length_bin_size: step between consecutive word-length bins
    :type word_length_bin_size: int
    :param max_word_length: upper limit (inclusive) of the word-length bins
    :type max_word_length: int
    :return: a dictionary mapping each statistics field to an empty list
    :rtype: Dict[str, List[Any]]
    """
    fields = ['doc_id', 'doc_num', 'doc_width', 'doc_height',
              'lines', 'words', 'text_regions',
              'columns', 'extra', 'pages',
              'num_words', 'num_number_words', 'num_title_words', 'num_non_title_words',
              'num_stop_words', 'num_punctuation_words', 'num_oversized_words']
    doc_stats: Dict[str, List[Any]] = {field: [] for field in fields}
    # The category labels are the values of the words-per-line mapping.
    for wpl_cat in text_stats.wpl_cat_range.values():
        doc_stats[f"words_per_line_{wpl_cat}"] = []
    # Bins are labelled by their upper bound: bin_size, 2 * bin_size, ..., up to max_word_length.
    for length_bin in range(word_length_bin_size, max_word_length + 1, word_length_bin_size):
        doc_stats[f"num_words_length_{length_bin}"] = []
    for width_range in layout_stats.get_boundary_width_ranges(line_width_boundary_points):
        doc_stats[f"line_width_range_{width_range}"] = []
    return doc_stats



# [docs]
def get_doc_stats(pagexml_docs: Union[pdm.PageXMLTextRegion, List[pdm.PageXMLTextRegion]],
                  line_width_boundary_points: List[int] = None,
                  stop_words: List[str] = None,
                  max_word_length: int = 30, doc_num: int = None,
                  use_re_word_boundaries: bool = False,
                  line_bin_width: int = 300, max_bin: int = 3000) -> Dict[str, List[Any]]:
    """Generate basic statistics for one or more PageXML document objects (number of text
    regions, lines, words, etc.).

    The result is column-oriented: each statistics field maps to a list with one entry
    per document, in the order in which the documents are passed.

    Line widths are categorised based on a list of boundary points that determine the width of
    each bin. If no boundary points are passed, boundary points are generated at every multiple
    of line_bin_width below max_bin. The same boundary points are used for all documents.

    :param pagexml_docs: a PageXML document object or a list of PageXML document objects
    :type pagexml_docs: Union[PageXMLTextRegion, List[PageXMLTextRegion]]
    :param line_width_boundary_points: a list of points indicating boundaries between categories of
        line widths
    :type line_width_boundary_points: List[int]
    :param stop_words: a list of stopwords, used for counting the number of stopwords
    :type stop_words: List[str]
    :param max_word_length: max word length above which words are considered oversized
    :type max_word_length: int
    :param doc_num: the number of the (first) doc in a sequence of docs; any further docs are
        numbered consecutively. If not passed, numbering starts at 1.
    :type doc_num: int
    :param use_re_word_boundaries: flag whether to use RegEx word boundaries for word count
    :type use_re_word_boundaries: bool
    :param line_bin_width: width of line bins, to aggregate lines of different lengths
        (only used when no boundary points are passed)
    :type line_bin_width: int
    :param max_bin: max line width bin (only used when no boundary points are passed)
    :type max_bin: int
    :return: a dictionary mapping each statistics field to a list of per-document values
    :rtype: Dict[str, List[Any]]
    """
    if line_width_boundary_points is None:
        line_width_boundary_points = list(range(line_bin_width, max_bin, line_bin_width))
    # The result keys (including the line-width ranges) are fixed up front, so the boundary
    # points cannot vary per document.
    pagexml_doc_stats = _init_doc_stats(line_width_boundary_points, max_word_length=max_word_length)
    if isinstance(pagexml_docs, pdm.PageXMLTextRegion):
        pagexml_docs = [pagexml_docs]
    # Honour the caller-supplied doc_num; without it, documents are numbered from 1.
    first_doc_num = 1 if doc_num is None else doc_num
    for doc_offset, pagexml_doc in enumerate(pagexml_docs):
        pagexml_doc_stats['doc_id'].append(pagexml_doc.id)
        pagexml_doc_stats['doc_num'].append(first_doc_num + doc_offset)
        pagexml_doc_stats['doc_width'].append(pagexml_doc.coords.width)
        pagexml_doc_stats['doc_height'].append(pagexml_doc.coords.height)
        # Lines without text are left out of the words-per-line and line-width statistics.
        lines = [line for line in pagexml_doc.get_lines() if line.text is not None]
        words = text_stats.get_doc_words(pagexml_doc, use_re_word_boundaries=use_re_word_boundaries)
        word_stats = text_stats.get_word_cat_stats(words, stop_words=stop_words,
                                                   max_word_length=max_word_length)
        wpl_stats = text_stats.get_words_per_line(lines)
        element_stats = pagexml_doc.stats
        for field in element_stats:
            pagexml_doc_stats[field].append(element_stats[field])
        for word_cat in word_stats:
            pagexml_doc_stats[word_cat].append(word_stats[word_cat])
        for wpl_cat in text_stats.wpl_cat_range.values():
            pagexml_doc_stats[f'words_per_line_{wpl_cat}'].append(wpl_stats[wpl_cat])
        line_width_stats = layout_stats.get_line_width_stats(lines, line_width_boundary_points)
        for line_width_range in line_width_stats:
            pagexml_doc_stats[f'line_width_range_{line_width_range}'].append(line_width_stats[line_width_range])
    return pagexml_doc_stats