Source code for pagexml.analysis.layout_stats

from collections import Counter
from collections import defaultdict
from typing import Dict, Generator, List, Tuple, Union

import numpy as np

import pagexml.model.physical_document_model as pdm
import pagexml.parser as pagexml_parser
from pagexml.model.physical_document_model import in_same_column



[docs]
def get_baseline_y(line: pdm.PageXMLTextLine) -> List[int]:
    """Collect the Y (vertical) coordinates of the baseline points of a text line.

    When ``line_starts_with_big_capital`` reports that the line opens with an
    oversized capital, only the points whose Y value is more than 20 smaller
    than ``line.baseline.bottom`` are kept, so the dip caused by that capital
    is left out. Otherwise the Y value of every baseline point is returned.

    :param line: a PageXML TextLine object with baseline information
    :type line: PageXMLTextLine
    :return: the Y coordinates of the selected baseline points
    :rtype: List[int]
    """
    if not line_starts_with_big_capital(line):
        return [point[1] for point in line.baseline.points]
    return [
        point[1]
        for point in line.baseline.points
        if point[1] < line.baseline.bottom - 20
    ]




[docs]
def line_starts_with_big_capital(line: pdm.PageXMLTextLine) -> bool:
    """Tell whether a line seems to start with a capital set in a larger font.

    Such a capital is aligned with the rest of the line at the top, so it sticks
    out at the bottom and drags the baseline down at the start of the line.

    :param line: a PageXML TextLine object with baseline information
    :type line: PageXMLTextLine
    :return: True if the baseline height is at least 30 and its lowest point
        lies within 100 of the left edge of the baseline
    :rtype: bool
    """
    if line.baseline.h >= 30:
        # A tall baseline alone is not enough: the dip has to sit at the
        # start (left side) of the line.
        x_of_lowest = find_lowest_point(line)[0]
        return x_of_lowest - line.baseline.left <= 100
    # The baseline is too flat to contain a big capital.
    return False




[docs]
def find_lowest_point(line: pdm.PageXMLTextLine) -> Tuple[int, int]:
    """Return the first baseline point whose Y value equals ``baseline.bottom``.

    Points are examined in the order in which they are stored on the baseline.
    If no point matches, ``None`` is returned.

    :param line: a PageXML TextLine object with baseline information
    :type line: PageXMLTextLine
    :return: the first point that has the lowest vertical coordinate
    :rtype: Tuple[int, int]
    """
    candidates = (
        point for point in line.baseline.points
        if point[1] == line.baseline.bottom
    )
    return next(candidates, None)




[docs]
def interpolate_points(p1: Tuple[int, int], p2: Tuple[int, int],
                       step: int = 50) -> Generator[Tuple[int, int], None, None]:
    """Determine the x coordinates between a pair of points on a baseline
    and calculate their corresponding y coordinates.

    The x coordinates are the multiples of ``step`` that are strictly greater
    than the x of the left point and no greater than the x of the right point.
    The y coordinates are linearly interpolated between the two points, with
    the offset from the left point truncated to an integer.

    :param p1: a 2D point
    :type p1: Tuple[int, int]
    :param p2: a 2D point
    :type p2: Tuple[int, int]
    :param step: the step size in pixels for interpolation
    :type step: int
    :return: a generator of interpolated (x, y) points based on step size
    :rtype: Generator[Tuple[int, int], None, None]
    """
    if p1[0] == p2[0]:
        # points 1 and 2 have the same x coordinate
        # so there is nothing to interpolate
        return
    if p1[0] > p2[0]:
        # p2 should be to the right of p1
        p1, p2 = p2, p1
    # first multiple of step to the right of p1
    start_x = p1[0] + step - (p1[0] % step)
    # last multiple of step that is not to the right of p2
    end_x = p2[0] - (p2[0] % step)
    # change in y per unit of x, expressed so that it is subtracted from p1's y
    delta_y = (p1[1] - p2[1]) / (p2[0] - p1[0])
    for int_x in range(start_x, end_x + 1, step):
        int_y = p1[1] - int((int_x - p1[0]) * delta_y)
        yield int_x, int_y




[docs]
def interpolate_baseline_points(points: List[Tuple[int, int]],
                                step: int = 50) -> Dict[int, int]:
    """Interpolate a whole baseline at fixed horizontal increments.

    Every pair of consecutive baseline points is handed to
    ``interpolate_points`` and the resulting (x, y) pairs are merged into a
    single mapping. When several segments produce the same x, the value of
    the later segment wins.

    :param points: the list of points of a baseline object
    :type points: List[Tuple[int, int]]
    :param step: the step size in pixels for interpolation
    :type step: int
    :return: a dictionary mapping interpolated x coordinates to y coordinates
    :rtype: Dict[int, int]
    """
    interpolated = {}
    for left, right in zip(points, points[1:]):
        # a pair with identical x coordinates has nothing to interpolate
        if left[0] != right[0]:
            interpolated.update(interpolate_points(left, right, step=step))
    return interpolated




[docs]
def compute_points_distances(points1: List[Tuple[int, int]], points2: List[Tuple[int, int]],
                             step: int = 50):
    """Compute the vertical distances between two point sequences.

    Both sequences are interpolated with ``interpolate_baseline_points`` and
    the absolute difference in y is taken at every interpolated x coordinate
    that occurs in both. An empty array is returned when either sequence is
    ``None`` or when the sequences share no interpolated x coordinate.

    :param points1: the points of the first line
    :type points1: List[Tuple[int, int]]
    :param points2: the points of the second line
    :type points2: List[Tuple[int, int]]
    :param step: the step size in pixels for interpolation
    :type step: int
    :return: a numpy array of absolute vertical distances
    """
    if points1 is None or points2 is None:
        return np.array([])
    first = interpolate_baseline_points(points1, step=step)
    second = interpolate_baseline_points(points2, step=step)
    shared_xs = [x for x in first if x in second]
    return np.array([abs(second[x] - first[x]) for x in shared_xs])




[docs]
def _collect_baseline_points(line: Union[pdm.PageXMLTextLine, List[pdm.PageXMLTextLine]]
                             ) -> List[Tuple[int, int]]:
    """Gather the baseline points of a line (or list of lines), skipping lines without points."""
    lines = [line] if isinstance(line, pdm.PageXMLTextLine) else line
    # The None check has to come before the inner loop: iterating over a
    # missing point list would raise a TypeError before any filter runs.
    return [point
            for text_line in lines
            if text_line.baseline.points is not None
            for point in text_line.baseline.points]


def compute_baseline_distances(line1: Union[pdm.PageXMLTextLine, List[pdm.PageXMLTextLine]],
                               line2: Union[pdm.PageXMLTextLine, List[pdm.PageXMLTextLine]],
                               step: int = 50) -> np.ndarray:
    """Compute the vertical distance between two baselines, based on
    their horizontal overlap, using a fixed step size. Interpolated
    points will be generated at fixed increments of step size for
    both baselines, so they have points with corresponding x
    coordinates to calculate the distance.

    If two lines have no horizontal overlap, it returns an array with
    a single distance between the average heights of the two baselines.

    :param line1: the first line (or list of adjacent lines) in the comparison
    :type line1: Union[PageXMLTextLine, List[PageXMLTextLine]]
    :param line2: the second line (or list of adjacent lines) in the comparison
    :type line2: Union[PageXMLTextLine, List[PageXMLTextLine]]
    :param step: the step size in pixels for interpolation
    :type step: int
    :return: an array of vertical distances based on horizontal overlap
    :rtype: np.ndarray
    """
    points1 = _collect_baseline_points(line1)
    points2 = _collect_baseline_points(line2)
    distances = compute_points_distances(points1, points2, step=step)
    if len(distances) == 0:
        # no shared interpolation points: compare the average heights instead
        avg1 = average_baseline_height(line1)
        avg2 = average_baseline_height(line2)
        distances = np.array([abs(avg1 - avg2)])
    return distances




[docs]
def get_bottom_points(line: pdm.PageXMLTextLine) -> List[Tuple[int, int]]:
    """Return the tail of the bounding polygon, starting at its first right-most point.

    The first point whose x equals ``line.coords.right`` is located and every
    point from there to the end of the point list is returned.
    NOTE(review): presumably the polygon starts at the top left and runs
    clockwise, so that this tail is the bottom edge - confirm against the data.

    :param line: a PageXML TextLine object with bounding coordinates
    :type line: PageXMLTextLine
    :return: the points from the first right-most point onwards
    :rtype: List[Tuple[int, int]]
    """
    right_edge_indexes = [
        index for index, point in enumerate(line.coords.points)
        if point[0] == line.coords.right
    ]
    first_right_most = right_edge_indexes[0]
    return line.coords.points[first_right_most:]




[docs]
def compute_bounding_box_distances(line1: Union[pdm.PageXMLTextLine, List[pdm.PageXMLTextLine]],
                                   line2: Union[pdm.PageXMLTextLine, List[pdm.PageXMLTextLine]],
                                   step: int = 50):
    """Compute the vertical distances between the bounding polygons of two lines.

    The points returned by ``get_bottom_points`` for each line are compared
    with ``compute_points_distances``.

    :param line1: the first line in the comparison
    :param line2: the second line in the comparison
    :param step: the step size in pixels for interpolation
    :type step: int
    :return: the distances computed by ``compute_points_distances``
    """
    return compute_points_distances(get_bottom_points(line1),
                                    get_bottom_points(line2),
                                    step=step)




[docs]
def average_baseline_height(line: Union[pdm.PageXMLTextLine, List[pdm.PageXMLTextLine]]) -> int:
    """Compute the average (mean) baseline height for comparing lines that
    are not horizontally aligned.

    Each segment between two consecutive baseline points contributes its mean
    height weighted by its horizontal width. If the baseline has no horizontal
    extent (a single point, or all points on the same x), the plain mean of
    the y coordinates is returned instead.

    :param line: a TextLine or a list of adjacent lines
    :type line: Union[PageXMLTextLine, List[PageXMLTextLine]]
    :return: the average (mean) baseline height across all its baseline points
    :rtype: int
    :raises IndexError: if there are no baseline points at all
    """
    if isinstance(line, pdm.PageXMLTextLine):
        points = line.baseline.points
    else:
        points = [point for text_line in line for point in text_line.baseline.points]
    if not points:
        # IndexError is what callers already got for an empty baseline
        raise IndexError('cannot compute the average baseline height without baseline points')

    weighted_height = 0.0
    total_width = 0
    # iterate over each subsequent pair of baseline points
    for curr_point, next_point in zip(points, points[1:]):
        segment_width = abs(next_point[0] - curr_point[0])
        segment_avg = (curr_point[1] + next_point[1]) / 2
        # segment contributes its average height times its width
        weighted_height += segment_avg * segment_width
        # Normalise by the summed segment widths rather than max(x) - min(x),
        # so that points that backtrack horizontally (e.g. several lines
        # concatenated out of order) do not inflate the average.
        total_width += segment_width

    if total_width == 0:
        # every weight is zero, so fall back to the unweighted mean height
        return int(sum(point[1] for point in points) / len(points))
    return int(weighted_height / total_width)




[docs]
def sort_coords_above_below_baseline(line: pdm.PageXMLTextLine,
                                     debug: int = 0) -> Tuple[List[Tuple[int, int]],
                                                              List[Tuple[int, int]]]:
    """Split the list of bounding polygon coordinates of a line in sets of points above and below
    the baseline. When a line has no baseline or no bounding polygon, empty lists are
    returned. Empty lists are also returned when the bounding polygon lies entirely to the
    left or to the right of the baseline.

    The baseline is interpolated at steps of 50 and the polygon points are sorted by x.
    Each polygon point is compared with the interpolated baseline point nearest to it in x:
    a point with a smaller y than that baseline point is listed as above, any other point
    (including one with equal y) as below. If interpolation yields no baseline points,
    no polygon point is classified and both lists stay empty.

    :param line: a PageXML text line
    :type line: PageXMLTextLine
    :param debug: the detail level of debug information (0 = none, higher is more)
    :type debug: int
    :return: two lists of bounding polygon points, first those above, then those below
    :rtype: tuple
    """
    # index of the next polygon point (in x-sorted order) that still has to be classified
    ci_c = 0
    below_baseline = []
    above_baseline = []
    if line.baseline is None or line.coords is None:
        return above_baseline, below_baseline
    # NOTE(review): relies on the truthiness of the baseline/coords objects - confirm
    # what makes them falsy (presumably having no points).
    if not line.baseline or not line.coords:
        return above_baseline, below_baseline
    # no horizontal overlap between bounding polygon and baseline
    if line.coords.right < line.baseline.left:
        return above_baseline, below_baseline
    if line.coords.left > line.baseline.right:
        return above_baseline, below_baseline
    # list of (x, y) tuples in the order in which the interpolation produced them
    interpolated_baseline_points = [i for i in interpolate_baseline_points(line.baseline.points, step=50).items()]
    if debug > 2:
        print('baseline_points:', line.baseline.points)
        print('interpolated_baseline_points:', interpolated_baseline_points)
    sorted_coord_points = sorted(line.coords.points, key=lambda p: p[0])
    if debug > 0:
        print('sorted_coord_points:', sorted_coord_points)
        print('len(sorted_coord_points):', len(sorted_coord_points))
    if debug > 1:
        print('ci_c:', ci_c)
    num_baseline_points = len(interpolated_baseline_points)
    num_coord_points = len(sorted_coord_points)
    for ci_b, curr_b in enumerate(interpolated_baseline_points):
        curr_bx, curr_by = curr_b
        # None for the last baseline point, which then takes all remaining polygon points
        next_b = interpolated_baseline_points[ci_b + 1] if ci_b + 1 < num_baseline_points else None
        if debug > 0:
            print(f'sort_above_below - curr_b: {curr_b}')
            print('\tci_c:', ci_c, '\tnum_coord_points:', num_coord_points)
        if ci_c == num_coord_points:
            # every polygon point has been classified
            break
        for curr_c in sorted_coord_points[ci_c:]:
            curr_cx, curr_cy = curr_c
            # stop as soon as a polygon point is closer (in x) to the next baseline point;
            # it is picked up again in the next iteration of the outer loop
            if next_b and abs(next_b[0] - curr_cx) < abs(curr_b[0] - curr_cx):
                break
            if debug > 0:
                print(f'sort_above_below - curr_c ({ci_c}): {curr_c}')
            ci_c += 1
            if curr_cy < curr_by:
                if debug > 0:
                    print(f'sort_above_below - above')
                above_baseline.append(curr_c)
            else:
                if debug > 0:
                    print(f'sort_above_below - below')
                below_baseline.append(curr_c)

    return above_baseline, below_baseline




[docs]
def get_text_heights(line: pdm.PageXMLTextLine, step: int = 50,
                     ignore_errors: bool = True, debug: int = 0) -> Union[np.ndarray, None]:
    """Compute the text height of a line at fixed horizontal increments.

    The height at an x coordinate is the interpolated baseline y minus the
    interpolated y of the bounding polygon points above the baseline. Only
    x coordinates present in both interpolations are used.

    :param line: a PageXML text line
    :type line: PageXMLTextLine
    :param step: the step size in pixels for interpolation
    :type step: int
    :param ignore_errors: when False, raise an error for a line without bounding
        coordinates above the baseline and warn about a line without bounding
        coordinates below the baseline
    :type ignore_errors: bool
    :param debug: the detail level of debug information (0 = none, higher is more)
    :type debug: int
    :return: an array of heights, or None if no height could be determined
    :rtype: Union[np.ndarray, None]
    :raises ValueError: if ignore_errors is False and the line has no bounding
        coordinates above the baseline
    """
    import warnings

    above_baseline, below_baseline = sort_coords_above_below_baseline(line, debug=debug)
    if len(above_baseline) == 0:
        if ignore_errors is False:
            raise ValueError(f'line {line.id} has no bounding coordinates above baseline')
        return None
    if len(below_baseline) == 0 and ignore_errors is False:
        # heights only need the points above the baseline, so this is not fatal
        warnings.warn(f'line {line.id} has no bounding coordinates below baseline')
    int_base = interpolate_baseline_points(line.baseline.points, step=step)
    int_above = interpolate_baseline_points(above_baseline, step=step)

    heights = [int_base[x] - int_above[x] for x in int_base if x in int_above]
    if len(heights) == 0:
        return None
    return np.array(heights)




[docs]
def compute_height_stats(line_heights: np.array) -> Dict[str, int]:
    """Summarise an array of line heights.

    All values are returned as native Python numbers (not NumPy scalars), so
    the result can be serialised, e.g. to JSON. The mean is rounded and the
    median truncated to an integer.

    :param line_heights: a non-empty array of heights
    :type line_heights: np.ndarray
    :return: the maximum, minimum, mean and median height
    :rtype: Dict[str, int]
    """
    return {
        # .item() converts the NumPy scalar to the matching Python type
        'max': line_heights.max().item(),
        'min': line_heights.min().item(),
        'mean': int(round(line_heights.mean())),
        'median': int(np.median(line_heights))
    }




[docs]
def get_line_height_stats(line: pdm.PageXMLTextLine, step: int = 50,
                          ignore_errors: bool = False, debug: int = 0) -> Union[Dict[str, int], None]:
    """Compute height statistics for a single text line.

    The heights come from ``get_text_heights`` and are summarised with
    ``compute_height_stats``.

    :param line: a PageXML text line
    :type line: PageXMLTextLine
    :param step: the step size in pixels for interpolation
    :type step: int
    :param ignore_errors: passed on to ``get_text_heights``
    :type ignore_errors: bool
    :param debug: the detail level of debug information (0 = none, higher is more)
    :type debug: int
    :return: the height statistics, or None when no heights are available or
        an AttributeError occurs while computing them
    :rtype: Union[Dict[str, int], None]
    """
    try:
        heights = get_text_heights(line, step=step, ignore_errors=ignore_errors, debug=debug)
        if debug > 0:
            print('get_line_height_stats - line_heights:', heights)
        return None if heights is None else compute_height_stats(heights)
    except AttributeError:
        return None
    except IndexError:
        # show the geometry that triggered the error before passing it on
        print('ERROR INFO:')
        print('get_line_height_stats - line.baseline:', line.baseline)
        print('get_line_height_stats - line.coords:', line.coords)
        raise




[docs]
def get_line_distances(lines: List[pdm.PageXMLTextLine]) -> List[np.ndarray]:
    """Compute the distances between each line and the line that follows it.

    Baselines are compared when both lines have one; otherwise the bounding
    polygons are compared.

    :param lines: a list of PageXML text lines
    :type lines: List[PageXMLTextLine]
    :return: one array of distances per pair of subsequent lines (empty when
        there are fewer than two lines)
    :rtype: List[np.ndarray]
    """
    all_distances = []
    for curr_line, next_line in zip(lines, lines[1:]):
        if curr_line.baseline and next_line.baseline:
            distances = compute_baseline_distances(curr_line, next_line)
        else:
            distances = compute_bounding_box_distances(curr_line, next_line)
        all_distances.append(distances)
    # return only after all pairs have been processed
    return all_distances




[docs]
def get_textregion_line_distances(text_region: pdm.PageXMLTextRegion) -> List[np.ndarray]:
    """Collect the baseline distances between subsequent lines of a text region.

    Each line of every inner text region is compared with the next line of the
    same inner region. The last line of an inner region is compared with the
    first line of the following inner region, but only if ``in_same_column``
    holds for the two regions and the following region has lines.

    :param text_region: a TextRegion object that contains TextLines
    :type text_region: PageXMLTextRegion
    :return: a list of numpy ndarrays of line distances
    :rtype: List[np.ndarray]
    """
    distances_per_pair: List[np.ndarray] = []
    inner_regions = text_region.get_inner_text_regions()
    for region_index, region in enumerate(inner_regions):
        following_region = None
        continues_below = False
        if region_index + 1 < len(inner_regions):
            # check if the next textregion is directly below the current one
            following_region = inner_regions[region_index + 1]
            continues_below = in_same_column(region, following_region)
        for line_index, line in enumerate(region.lines):
            if line_index + 1 < len(region.lines):
                following_line = region.lines[line_index + 1]
            elif continues_below and following_region.lines:
                # bridge the gap between this region's last line and the
                # first line of the region directly below it
                following_line = following_region.lines[0]
            else:
                following_line = None
            if following_line:
                distances_per_pair.append(compute_baseline_distances(line, following_line))
    return distances_per_pair




[docs]
def get_textregion_avg_line_distance(text_region: pdm.PageXMLTextRegion,
                                     avg_type: str = "macro") -> float:
    """Return the median distance between subsequent lines of a text region.

    The distances come from ``get_textregion_line_distances``. With "macro"
    (the default) the median is taken over the mean distance of each pair of
    lines; with "micro" it is taken over all individual distances pooled
    together. When there are no distances at all, 0 is returned.

    :param text_region: a TextRegion object that contains TextLines
    :type text_region: PageXMLTextRegion
    :param avg_type: the type of averaging to apply (macro or micro)
    :type avg_type: str
    :return: the median distance between subsequent lines
    :rtype: float
    :raises ValueError: if avg_type is neither "macro" nor "micro"
    """
    if avg_type != "micro" and avg_type != "macro":
        raise ValueError(f'Invalid avg_type "{avg_type}", must be "macro" or "micro"')
    line_distances = get_textregion_line_distances(text_region)
    if len(line_distances) == 0:
        return 0
    if avg_type == "macro":
        pair_means = np.array([pair_distances.mean() for pair_distances in line_distances])
        return float(np.median(pair_means))
    pooled = np.concatenate(line_distances)
    return float(np.median(pooled))




[docs]
def get_textregion_avg_char_width(text_region: pdm.PageXMLTextRegion) -> float:
    """Return the estimated average (mean) character width, determined as the sum
    of the width of text lines divided by the sum of the number of characters
    of all text lines.

    The width of a line is taken from its baseline, or from its bounding
    coordinates if it has no baseline. Lines without text and lines that have
    neither a baseline nor coordinates are left out entirely.

    :param text_region: a TextRegion object that contains TextLines
    :type text_region: PageXMLTextRegion
    :return: the average (mean) character width, or 0.0 if there are no characters
    :rtype: float
    """
    total_chars = 0
    total_text_width = 0
    for tr in text_region.get_inner_text_regions():
        for line in tr.lines:
            if line.text is None:
                continue
            if line.baseline is not None:
                line_width = line.baseline.width
            elif line.coords is not None:
                line_width = line.coords.width
            else:
                # no width available: counting the characters of this line
                # would drag the average down
                continue
            total_chars += len(line.text)
            total_text_width += line_width
    return total_text_width / total_chars if total_chars else 0.0




[docs]
def get_textregion_avg_line_width(text_region: pdm.PageXMLTextRegion, unit: str = "char") -> float:
    """Return the average (mean) width of the text lines in a text region,
    measured either in characters or in pixels.

    In "char" mode the width of a line is the length of its text. In "pixel"
    mode it is the width of its baseline, or of its bounding coordinates if it
    has no baseline; lines that have neither are left out. Lines without text
    are always left out.

    :param text_region: a TextRegion object that contains TextLines
    :type text_region: PageXMLTextRegion
    :param unit: the unit to measure line width, either char or pixel
    :type unit: str
    :return: the average (mean) line width, or 0.0 if no line could be measured
    :rtype: float
    :raises ValueError: if unit is neither "char" nor "pixel"
    """
    if unit not in {'char', 'pixel'}:
        raise ValueError(f'Invalid unit "{unit}", must be "char" (default) or "pixel"')
    total_lines = 0
    total_line_width = 0
    for tr in text_region.get_inner_text_regions():
        for line in tr.lines:
            if line.text is None:
                # skip non-text lines
                continue
            if unit == 'char':
                # geometry is not needed (and may be missing) in char mode
                line_width = len(line.text)
            elif line.baseline is not None:
                line_width = line.baseline.w
            elif line.coords is not None:
                line_width = line.coords.w
            else:
                # no geometry to measure a pixel width from
                continue
            total_lines += 1
            total_line_width += line_width
    return total_line_width / total_lines if total_lines > 0 else 0.0




[docs]
def compute_textregion_distance(tr1: pdm.PageXMLTextRegion,
                                tr2: pdm.PageXMLTextRegion) -> Union[int, float]:
    """Compute the distance between two text regions.

    The distance is 0 when ``pdm.is_vertically_overlapping`` holds for the two
    regions. Otherwise the region with the smaller ``coords.top`` is taken as
    the upper one. If both regions have lines, the median baseline distance
    between the last line of the upper region and the first line of the lower
    region is returned; if not, the gap between the bottom of the upper region
    and the top of the lower region.

    :param tr1: a text region
    :type tr1: PageXMLTextRegion
    :param tr2: another text region
    :type tr2: PageXMLTextRegion
    :return: the distance between the two regions
    :rtype: Union[int, float]
    """
    if pdm.is_vertically_overlapping(tr1, tr2):
        return 0
    upper, lower = (tr2, tr1) if tr1.coords.top > tr2.coords.top else (tr1, tr2)
    if len(upper.lines) == 0 or len(lower.lines) == 0:
        # without lines on both sides only the region boundaries can be compared
        return lower.coords.top - upper.coords.bottom
    line_gap = compute_baseline_distances(upper.lines[-1], lower.lines[0])
    return float(np.median(line_gap))




[docs]
def compute_lines_stats(lines: List[pdm.PageXMLTextLine],
                        stats: Dict[str, Dict[str, Counter]]) -> None:
    """Add the height, width, word count and line distance of text lines to ``stats``.

    The lines are processed in sorted order. For every line but the first,
    the median baseline distance to the preceding line is counted under
    ``stats["line"]["distance"]``.

    :param lines: a list of PageXML text lines
    :type lines: List[PageXMLTextLine]
    :param stats: the nested counters to update in place
    :type stats: Dict[str, Dict[str, Counter]]
    """
    prev_line = None
    for curr_line in sorted(lines):
        stats["line"]["height"].update([curr_line.coords.h])
        stats["line"]["width"].update([curr_line.coords.w])
        stats["line"]["words"].update([curr_line.num_words])
        if isinstance(prev_line, pdm.PageXMLTextLine):
            distances = compute_baseline_distances(prev_line, curr_line)
            if len(distances) > 0:
                try:
                    stats["line"]["distance"].update([np.median(distances)])
                except TypeError:
                    print(prev_line.baseline)
                    print(curr_line.baseline)
                    print(distances, type(distances))
                    raise
        # always advance, also when no distance could be computed, so the
        # next line is compared with its direct predecessor
        prev_line = curr_line




[docs]
def compute_textregions_stats(text_regions: List[pdm.PageXMLTextRegion],
                              stats: Dict[str, Dict[str, Counter]]) -> None:
    """Add the statistics of text regions, and of the lines they contain, to ``stats``.

    The regions are processed in sorted order. For each region its height,
    width and every field of its own ``stats`` (except "text_regions") are
    counted. The distance to the preceding region is counted as well when the
    two regions overlap horizontally.

    :param text_regions: a list of PageXML text regions
    :type text_regions: List[PageXMLTextRegion]
    :param stats: the nested counters to update in place
    :type stats: Dict[str, Dict[str, Counter]]
    """
    previous = None
    for region in sorted(text_regions):
        has_previous = isinstance(previous, pdm.PageXMLTextRegion)
        if has_previous and pdm.is_horizontally_overlapping(region, previous):
            gap = compute_textregion_distance(previous, region)
            stats["textregion"]["vertical_dist"].update([gap])
        stats["textregion"]["height"].update([region.coords.h])
        stats["textregion"]["width"].update([region.coords.w])
        region_stats = region.stats
        for field in region_stats:
            if field != "text_regions":
                stats["textregion"][f"{field}"].update([region_stats[field]])
        if len(region.lines) > 0:
            compute_lines_stats(region.lines, stats)
        previous = region




[docs]
def compute_columns_stats(columns: List[pdm.PageXMLColumn],
                          stats: Dict[str, Dict[str, Counter]]):
    """Add the statistics of columns, and of the text regions they contain, to ``stats``.

    For each column its height, width and every field of its own ``stats``
    are counted under ``stats["column"]``.

    :param columns: a list of PageXML columns
    :type columns: List[PageXMLColumn]
    :param stats: the nested counters to update in place
    :type stats: Dict[str, Dict[str, Counter]]
    :return: the updated ``stats`` object
    """
    for column in columns:
        stats["column"]["height"][column.coords.h] += 1
        stats["column"]["width"][column.coords.w] += 1
        own_stats = column.stats
        for field in own_stats:
            stats["column"][f"{field}"][own_stats[field]] += 1
        if len(column.text_regions) > 0:
            compute_textregions_stats(column.text_regions, stats)
    return stats




[docs]
def compute_pages_stats(pages: List[pdm.PageXMLPage], stats: Dict[str, Dict[str, Counter]]):
    """Add the statistics of pages, and of the columns and text regions they contain, to ``stats``.

    For each page its height, width and every field of its own ``stats`` are
    counted under ``stats["page"]``.

    :param pages: a list of PageXML pages
    :type pages: List[PageXMLPage]
    :param stats: the nested counters to update in place
    :type stats: Dict[str, Dict[str, Counter]]
    :return: the updated ``stats`` object
    """
    for page in pages:
        stats["page"]["height"][page.coords.h] += 1
        stats["page"]["width"][page.coords.w] += 1
        own_stats = page.stats
        for field in own_stats:
            stats["page"][f"{field}"][own_stats[field]] += 1
        # descend into whatever the page contains
        if len(page.columns) > 0:
            compute_columns_stats(page.columns, stats)
        if len(page.text_regions) > 0:
            compute_textregions_stats(page.text_regions, stats)
    return stats




[docs]
def compute_scans_stats(scans: List[pdm.PageXMLScan], stats: Dict[str, Dict[str, Counter]]):
    """Add the statistics of scans, and of everything they contain, to ``stats``.

    For each scan its height, width and the fields of its own ``stats`` are
    counted under ``stats["scan"]``; the fields "columns", "extra" and "pages"
    are not counted.

    :param scans: a list of PageXML scans
    :type scans: List[PageXMLScan]
    :param stats: the nested counters to update in place
    :type stats: Dict[str, Dict[str, Counter]]
    :return: the updated ``stats`` object
    """
    skipped_fields = {'columns', 'extra', 'pages'}
    for scan in scans:
        stats["scan"]["height"][scan.coords.h] += 1
        stats["scan"]["width"][scan.coords.w] += 1
        own_stats = scan.stats
        for field in own_stats:
            if field not in skipped_fields:
                stats["scan"][f"{field}"][own_stats[field]] += 1
        # descend into whatever the scan contains
        if len(scan.pages) > 0:
            compute_pages_stats(scan.pages, stats)
        if len(scan.columns) > 0:
            compute_columns_stats(scan.columns, stats)
        if len(scan.text_regions) > 0:
            compute_textregions_stats(scan.text_regions, stats)
    return stats




[docs]
def compute_pagexml_stats(docs: List[pdm.PageXMLDoc]) -> Dict[str, Dict[str, Counter]]:
    """Compute statistics on the PageXML elements in a list of PageXMLDoc objects.

    The documents are grouped by the name of their class and each group is
    handed to the statistics function for that element type. Documents of any
    other class are ignored.

    :param docs: a list of PageXMLDoc objects
    :type docs: List[PageXMLDoc]
    :return: A nested dictionary of statistics per PageXML element type
    :rtype: Dict[str, Dict[str, Counter]]
    """
    stats = defaultdict(lambda: defaultdict(Counter))
    docs_by_type = {}
    for doc in docs:
        docs_by_type.setdefault(doc.__class__.__name__, []).append(doc)
    for type_name, typed_docs in docs_by_type.items():
        if type_name == 'PageXMLScan':
            compute_scans_stats(typed_docs, stats)
        elif type_name == 'PageXMLPage':
            compute_pages_stats(typed_docs, stats)
        elif type_name == 'PageXMLColumn':
            compute_columns_stats(typed_docs, stats)
        elif type_name == 'PageXMLTextRegion':
            compute_textregions_stats(typed_docs, stats)
        elif type_name == 'PageXMLTextLine':
            compute_lines_stats(typed_docs, stats)
    return stats




[docs]
def get_line_widths(pagexml_files: List[Union[str, pdm.PageXMLTextRegion]] = None,
                    line_width_bin_size: int = 50) -> List[int]:
    """Return a list of binned line widths for the lines in a list of PageXML documents.

    Each element is either the path of a PageXML file, which is parsed first,
    or an already parsed PageXMLTextRegion. Elements of any other type are
    ignored. Every line width is rounded down to a multiple of the bin size.

    :param pagexml_files: a list of PageXML filepaths or PageXMLTextRegion objects
    :type pagexml_files: List[Union[str, PageXMLTextRegion]]
    :param line_width_bin_size: the bin size for grouping lines (default is 50 pixels)
    :type line_width_bin_size: int
    :return: a list of line widths (empty if no documents are given)
    :rtype: List[int]
    """
    line_widths = []
    if pagexml_files is None:
        # the default argument means there is nothing to measure
        return line_widths
    for pagexml_file in pagexml_files:
        if isinstance(pagexml_file, str):
            doc = pagexml_parser.parse_pagexml_file(pagexml_file=pagexml_file)
        elif isinstance(pagexml_file, pdm.PageXMLTextRegion):
            doc = pagexml_file
        else:
            continue
        line_widths.extend(int(line.coords.w / line_width_bin_size) * line_width_bin_size
                           for line in doc.get_lines())
    return line_widths




[docs]
def find_line_width_boundary_points(line_widths: List[int], line_bin_size: int = 50,
                                    min_ratio: float = 0.25) -> List[int]:
    """Find the minima in the distribution of line widths relative to the peaks in the distribution.
    These minima represent the boundaries between clusters of lines within the same line width
    intervals.

    The distribution is scanned from width 0 up to the largest width in steps of the bin
    size, so only widths that are multiples of the bin size are taken into account.

    :param line_widths: a list of PageXML text line widths
    :type line_widths: List[int]
    :param line_bin_size: the bin size for grouping lines to establish the line width distribution (default 50 pixels)
    :type line_bin_size: int
    :param min_ratio: the minimum ratio between a peak frequency and its neighbouring minimum to determine
        if the minimum is a category boundary
    :type min_ratio: float
    :return: A list of category boundary points (empty if there are no line widths)
    :rtype: List[int]
    """
    if len(line_widths) == 0:
        # no distribution to analyse
        return []
    width_freq = Counter(line_widths)
    num_lines = len(line_widths)
    max_width = max(width_freq)
    max_freq = max(width_freq.values())
    boundary_points = []
    curr_max_freq = 0
    # start above any real frequency, so the first drop always sets a minimum
    curr_min_freq = max_freq + 1
    curr_min_width = None
    prev_freq = 0

    for width in range(0, max_width + 1, line_bin_size):
        freq = width_freq[width]
        if freq > curr_max_freq:
            curr_max_freq = freq
        if freq < prev_freq and freq < curr_min_freq:
            # the distribution is falling: remember the lowest point so far
            curr_min_freq = freq
            curr_min_width = width
        # the distribution rises again after a minimum, in a bin that holds
        # more than 1% of all lines
        if freq / num_lines > 0.01 and freq > prev_freq and freq > curr_min_freq:
            if (curr_max_freq - curr_min_freq) / curr_max_freq > min_ratio:
                # the dip is deep enough relative to the peak to be a boundary
                boundary_points.append(curr_min_width)
                curr_max_freq = 0
                curr_min_freq = max_freq + 1
        prev_freq = freq
    return boundary_points




[docs]
def categorise_line_width(line: pdm.PageXMLTextLine, boundary_points: List[int]) -> str:
    """Name the width interval that a line falls in.

    The boundary points are walked in the given order and the line is placed
    in the interval that ends at the first boundary point larger than the
    width of the line. A line at least as wide as every boundary point falls
    in the open-ended interval after the last boundary point.

    :param line: a PageXML text line
    :type line: PageXMLTextLine
    :param boundary_points: a list of line width category boundary points
    :type boundary_points: List[int]
    :return: the interval as "<lower>-<upper>", or "<lower>-" for the last interval
    :rtype: str
    """
    lower_bounds = [0, *boundary_points]
    for lower, upper in zip(lower_bounds, boundary_points):
        if upper > line.coords.w:
            return f"{lower}-{upper}"
    return f"{lower_bounds[-1]}-"




[docs]
def get_boundary_width_ranges(boundary_points: List[int]) -> List[str]:
    """List the names of the width intervals defined by a list of boundary points.

    The first interval starts at 0, each boundary point closes one interval
    and opens the next, and the final interval is open-ended.

    :param boundary_points: a list of line width category boundary points
    :type boundary_points: List[int]
    :return: the intervals as "<lower>-<upper>" strings, ending with "<lower>-"
    :rtype: List[str]
    """
    lower_bounds = [0, *boundary_points]
    width_ranges = [f"{lower}-{upper}" for lower, upper in zip(lower_bounds, boundary_points)]
    width_ranges.append(f"{lower_bounds[-1]}-")
    return width_ranges




[docs]
def get_line_width_stats(lines: List[pdm.PageXMLTextLine], boundary_points: List[int]) -> Counter:
    """Count the number of lines per line width interval.

    The intervals are defined by a list of boundary points (line widths that
    separate the categories of line width). Every interval is present in the
    result, also when no line falls in it.

    :param lines: A list of PageXML text lines
    :type lines: List[PageXMLTextLine]
    :param boundary_points: A list of line width category boundary points
    :type boundary_points: List[int]
    :return: A counter with the number of lines per line width interval
    :rtype: Counter
    """
    # start every interval at zero so that empty intervals are reported too
    width_counts = Counter({width_range: 0 for width_range in get_boundary_width_ranges(boundary_points)})
    width_counts.update(categorise_line_width(line, boundary_points) for line in lines)
    return width_counts