Source code for pagexml.column_parser

import copy
import re
from collections import Counter
from typing import Dict, List, Tuple

import pagexml.helper.pagexml_helper as pagexml_helper
import pagexml.model.physical_document_model as pdm


def within_column(line: pdm.PageXMLTextLine, column_range: Dict[str, int],
                  overlap_threshold: float = 0.5) -> bool:
    """Determine if a given line is within the horizontal range of a column.

    :param line: the line to test; its ``coords.left``, ``coords.right`` and
        ``coords.width`` are read.
    :param column_range: a dict with integer ``"start"`` and ``"end"`` keys
        giving the horizontal extent of the column.
    :param overlap_threshold: the fraction of the line's width that must lie
        inside the column range. The comparison is strict, so a line that
        overlaps for exactly this fraction is NOT within the column.
    :return: True if the overlapping fraction exceeds ``overlap_threshold``.
        A zero-width line is never within a column.
    """
    line_width = line.coords.width
    if line_width == 0:
        # A zero-width line has no measurable overlap ratio; without this
        # guard the division below raises ZeroDivisionError.
        return False
    start = max(line.coords.left, column_range["start"])
    end = min(line.coords.right, column_range["end"])
    overlap = end - start if end > start else 0
    return overlap / line_width > overlap_threshold
def find_overlapping_columns(columns: List[pdm.PageXMLColumn]):
    """Group neighbouring columns that overlap horizontally.

    The ``columns`` list is sorted IN PLACE first, after which each column is
    compared with its direct successor only. When the pair overlaps according
    to ``pdm.is_horizontally_overlapping``, the successor joins the first
    group that already contains the current column, or the pair starts a new
    group.

    :param columns: the columns to inspect (re-ordered as a side effect).
    :return: a list of groups, each a list of columns to be merged.
    """
    columns.sort()
    merge_sets = []
    for left_col, right_col in zip(columns, columns[1:]):
        if not pdm.is_horizontally_overlapping(left_col, right_col):
            continue
        # Find the first existing group that already holds the left column.
        group = next((candidate for candidate in merge_sets if left_col in candidate), None)
        if group is None:
            merge_sets.append([left_col, right_col])
        else:
            group.append(right_col)
    return merge_sets
#################################################
# Identifying columns using pixel distributions #
#################################################
def compute_pixel_dist(lines: List[pdm.PageXMLTextLine]) -> Counter:
    """Count, per horizontal pixel coordinate, how many lines span it.

    For each line, every integer x-coordinate from ``coords.left`` up to and
    including ``coords.right`` is counted once.

    :param lines: the lines whose horizontal extents are tallied.
    :return: a Counter mapping x-coordinate to the number of lines covering it.
    """
    coverage = Counter()
    for text_line in lines:
        span = range(text_line.coords.left, text_line.coords.right + 1)
        coverage.update(span)
    return coverage
def new_gap_pixel_interval(pixel: int) -> dict:
    """Create a fresh one-pixel interval that both starts and ends at ``pixel``."""
    return dict(start=pixel, end=pixel)
def determine_freq_gap_interval(pixel_dist: Counter, gap_threshold: int) -> list:
    """Group the pixel coordinates of a distribution into contiguous intervals.

    The coordinates (keys of ``pixel_dist``; the counts are not used) are
    walked in ascending order. A coordinate closer than ``gap_threshold`` to
    its predecessor extends the current interval. On a larger jump the current
    interval is closed and a new one is started, but only if the current
    interval starts at least ``gap_threshold`` pixels after the end of the
    previously closed interval (initially 0). Otherwise the current interval
    stays open and later coordinates keep extending it.

    :param pixel_dist: pixel coordinate counts, e.g. from ``compute_pixel_dist``.
    :param gap_threshold: the minimum jump between coordinates that separates
        two intervals.
    :return: a list of ``{"start": int, "end": int}`` dicts; empty when the
        distribution is empty.
    """
    covered_pixels = sorted(pixel_dist)
    intervals = []
    if not covered_pixels:
        return intervals
    open_interval = new_gap_pixel_interval(covered_pixels[0])
    last_closed_end = 0
    for pixel, following in zip(covered_pixels, covered_pixels[1:]):
        if following - pixel < gap_threshold:
            open_interval["end"] = following
            continue
        # A jump of at least gap_threshold: close the interval only when it
        # lies far enough beyond the previously closed one.
        if open_interval["start"] - last_closed_end >= gap_threshold:
            intervals.append(open_interval)
            last_closed_end = open_interval["end"]
            open_interval = new_gap_pixel_interval(following)
    intervals.append(open_interval)
    return intervals
def find_column_gaps(lines: List[pdm.PageXMLTextLine], gap_threshold: int = 50):
    """Derive horizontal pixel intervals from the extents of a set of lines.

    Builds the pixel distribution of ``lines`` and groups it into intervals
    separated by jumps of at least ``gap_threshold`` pixels.

    :param lines: the lines to analyse.
    :param gap_threshold: the minimum horizontal jump, in pixels, that
        separates two intervals. Default is 50.
    :return: a list of ``{"start": int, "end": int}`` dicts.
    """
    return determine_freq_gap_interval(compute_pixel_dist(lines), gap_threshold)
def column_bounding_box_surrounds_lines(column: pdm.PageXMLColumn) -> bool:
    """Check that every line of the column overlaps the column's own coordinates.

    Each line from ``column.get_lines()`` is tested against the column with
    ``pagexml_helper.elements_overlap`` using a threshold of 0.6.

    :return: True if all lines pass (also when the column has no lines).
    """
    return all(
        pagexml_helper.elements_overlap(column, line, threshold=0.6)
        for line in column.get_lines()
    )
def is_text_column(column: pdm.PageXMLColumn) -> bool:
    """Check if the column contains at least one word of two or more characters.

    The text of every line is split on runs of non-word characters; the column
    counts as text as soon as one resulting token is longer than a single
    character. Lines without text are ignored.

    :param column: the column whose lines (``column.get_lines()``) are checked.
    :return: True if any line holds a token of length > 1, otherwise False.
    """
    for line in column.get_lines():
        if not line.text:
            continue
        # The pattern is a constant, valid regex, so no re.error handling is
        # needed around the split.
        tokens = re.split(r'\W+', line.text.strip())
        if any(len(token) > 1 for token in tokens):
            return True
    return False
def is_full_text_column(column: pdm.PageXMLColumn, page: pdm.PageXMLTextRegion = None,
                        num_page_cols: int = 2) -> bool:
    """Check if a page column is a full-text column (running from top to bottom of page).

    The expected width of a full column is derived from the horizontal extent
    of all lines on the page (leftmost ``left`` to rightmost ``right``), minus
    a margin of 300 pixels between each pair of columns, divided by the number
    of columns.

    :param column: the column to classify; reads ``coords.width``,
        ``coords.height`` and ``stats['lines']``.
    :param page: the region the column belongs to. Falls back to
        ``column.parent`` when omitted.
    :param num_page_cols: the number of text columns expected on the page.
    :return: True if the column looks like a full-text column, otherwise False.
    :raises ValueError: if no page is given and the column has no parent.
    """
    if page is None and column.parent is not None:
        page = column.parent
    if page is None:
        raise ValueError(f'no information on parent of column {column.id}')
    lines = page.get_lines()
    if not lines:
        # Without any lines there is no text extent to compare against.
        return False
    between_cols_margin = 300 * (num_page_cols - 1)
    left = min(line.coords.left for line in lines)
    # The text extent ends at the rightmost line edge, so take the maximum
    # (taking the minimum here underestimates the page text width).
    right = max(line.coords.right for line in lines)
    page_text_width = right - left
    full_column_text_width = (page_text_width - between_cols_margin) / num_page_cols
    if column.coords.width < full_column_text_width - 80:
        # narrow column is not a normal text column
        return False
    if column.coords.height > 2500:
        # full page-height column
        return True
    num_lines = column.stats['lines']
    if num_lines == 0:
        # No lines: avoid dividing by zero below; an empty column is not full text.
        return False
    if column.coords.height / num_lines > 100:
        # lines are far apart, probably something wrong
        return False
    # Always return a real bool rather than falling through to None.
    return column.coords.width > 700 and num_lines > 30
def is_noise_column(column: pdm.PageXMLColumn) -> bool:
    """Check if the column contains only very short lines.

    A line counts as very short when it has no text or its text is at most
    three characters long.

    :return: False as soon as one line has more than three characters of
        text, otherwise True (also for a column without lines).
    """
    has_substantial_line = any(
        line.text and len(line.text) > 3
        for line in column.get_lines()
    )
    return not has_substantial_line
def determine_column_type(column: pdm.PageXMLColumn) -> str:
    """Determine whether a column is a full-text, extra-text, header/footer or noise column.

    The checks run in a fixed order and the first match wins: full text,
    text, header/footer, noise. ``is_header_footer_column`` is not defined in
    the visible part of this module and is expected to be in scope.

    :param column: the column to classify.
    :return: one of ``'full_text'``, ``'extra_text'``, ``'header_footer'`` or
        ``'noise_column'``.
    :raises TypeError: if none of the checks match; diagnostic details about
        the column are printed first.
    """
    if is_full_text_column(column):
        return 'full_text'
    if is_text_column(column):
        return 'extra_text'
    if is_header_footer_column(column):
        return 'header_footer'
    if is_noise_column(column):
        return 'noise_column'
    print('Bounding box:', column.coords.box)
    print('Stats:', column.stats)
    num_chars = 0
    for line in column.get_lines():
        print(line.coords.box, line.text)
        # Lines may have no text (None); count them as zero characters so the
        # diagnostics do not fail before the intended error is raised.
        num_chars += len(line.text or '')
    print('num_chars:', num_chars)
    raise TypeError('unknown column type')
def make_derived_column(lines: List[pdm.PageXMLTextLine], metadata: dict, page_id: str) -> pdm.PageXMLColumn:
    """Build a new PageXMLColumn from a set of lines, column metadata and a page id.

    The column coordinates are derived from the lines with
    ``pdm.parse_derived_coords`` and the column id is derived from ``page_id``.

    :param lines: the lines that make up the column.
    :param metadata: the metadata to attach to the column.
    :param page_id: the id passed to ``set_derived_id``.
    :return: the newly created column.
    """
    derived_column = pdm.PageXMLColumn(
        metadata=metadata,
        coords=pdm.parse_derived_coords(lines),
        lines=lines,
    )
    derived_column.set_derived_id(page_id)
    return derived_column
def merge_columns(columns: List[pdm.PageXMLColumn],
                  doc_id: str, metadata: dict) -> pdm.PageXMLColumn:
    """Merge a list of columns into one.

    The text regions of all columns are collected and handed to
    ``pagexml_helper.merge_textregions``. The resulting region becomes the
    single text region of a new column of type ``'index_column'`` whose
    coordinates are a deep copy of the merged region's coordinates.

    NOTE(review): the original docstring states that overlapping text regions
    are merged and that lines are sorted by baseline height; that happens (if
    at all) inside ``merge_textregions`` and cannot be confirmed from here.

    :param columns: the columns to merge.
    :param doc_id: the id for the merged column.
    :param metadata: the metadata for both the merged region and the column.
    :return: the merged column, set as parent of the merged text region.
    """
    collected_regions = []
    for column in columns:
        collected_regions.extend(column.text_regions)
    merged_region = pagexml_helper.merge_textregions(collected_regions, metadata)
    merged_column = pdm.PageXMLColumn(
        doc_id=doc_id,
        doc_type='index_column',
        metadata=metadata,
        coords=copy.deepcopy(merged_region.coords),
        text_regions=[merged_region],
    )
    merged_column.set_as_parent([merged_region])
    return merged_column
def sort_lines_in_column_ranges(lines: List[pdm.PageXMLTextLine], column_ranges: List[Dict[str, int]],
                                overlap_threshold: float, debug: bool = False
                                ) -> Tuple[List[List[pdm.PageXMLTextLine]], List[pdm.PageXMLTextLine]]:
    """Distribute lines over column ranges based on horizontal overlap.

    A line is added to every column range it falls within (see
    ``within_column``). Lines that fit no range, including all zero-width
    lines, are collected separately.

    :param lines: the lines to distribute.
    :param column_ranges: dicts with ``"start"`` and ``"end"`` keys, one per column.
    :param overlap_threshold: passed on to ``within_column``.
    :param debug: if True, print how many lines ended up where.
    :return: a tuple of (one list of lines per column range, in the order of
        ``column_ranges``; the list of lines that fit no range).
    """
    column_lines = [[] for _ in column_ranges]
    extra_lines = []
    append_count = 0
    for line in lines:
        assigned = False
        # A zero-width line cannot overlap any range, so skip the range scan.
        if line.coords.width != 0:
            # enumerate gives the position of each range directly; looking it
            # up with list.index() would return the first EQUAL dict instead.
            for index, column_range in enumerate(column_ranges):
                if within_column(line, column_range, overlap_threshold=overlap_threshold):
                    column_lines[index].append(line)
                    append_count += 1
                    assigned = True
        if not assigned:
            extra_lines.append(line)
            append_count += 1
    if debug:
        print('RANGE SPLIT num_lines:', len(lines), 'append_count:', append_count)
        for ci, lines_in_column in enumerate(column_lines):
            print('\tcolumn', ci, '\tlines:', len(lines_in_column))
        print('\textra lines:', len(extra_lines))
    return column_lines, extra_lines
def merge_overlapping_columns(text_region: pdm.PageXMLTextRegion,
                              columns: List[pdm.PageXMLColumn]):
    """Merge columns that overlap horizontally and return the resulting columns.

    Column ranges may have expanded with lines that only partially overlap
    the initial range, so neighbouring columns can end up overlapping. Groups
    of such columns (see ``find_overlapping_columns``, which also sorts
    ``columns`` in place) are merged into one column each.

    A merged column takes the metadata of the first column in its group. Its
    id is derived from the id of ``text_region.parent``, which is also set as
    its parent, when the parent exists and has an id; otherwise its id is
    derived from the id of ``text_region`` itself.

    :param text_region: the region the columns were derived from.
    :param columns: the columns to check for overlap.
    :return: the columns that were not merged, followed by the merged columns.
    """
    merge_sets = find_overlapping_columns(columns)
    columns_to_merge = set()
    for merge_set in merge_sets:
        columns_to_merge.update(merge_set)
    result = [column for column in columns if column not in columns_to_merge]
    for merge_set in merge_sets:
        merged = merge_columns(merge_set, "temp_id", merge_set[0].metadata)
        if not (text_region.parent and text_region.parent.id):
            merged.set_derived_id(text_region.id)
        else:
            merged.set_derived_id(text_region.parent.id)
            merged.set_parent(text_region.parent)
        result.append(merged)
    return result
def make_column_range_columns(text_region: pdm.PageXMLTextRegion,
                              column_lines: List[List[pdm.PageXMLTextLine]]
                              ) -> List[pdm.PageXMLColumn]:
    """Turn groups of lines into columns and merge the ones that overlap.

    Each non-empty group becomes a PageXMLColumn whose coordinates are derived
    from its lines and whose type and metadata are deep copies of those of
    ``text_region``. The column id is derived from the id of
    ``text_region.parent``, which is also set as parent, when that parent
    exists and has an id; otherwise it is derived from the id of
    ``text_region``. Empty groups are skipped.

    :param text_region: the region the lines come from.
    :param column_lines: one list of lines per prospective column.
    :return: the columns after merging horizontally overlapping ones.
    """
    columns = []
    for line_group in column_lines:
        if not line_group:
            continue
        group_coords = pdm.parse_derived_coords(line_group)
        column = pdm.PageXMLColumn(
            doc_type=copy.deepcopy(text_region.type),
            metadata=copy.deepcopy(text_region.metadata),
            coords=group_coords,
            lines=line_group,
        )
        if not (text_region.parent and text_region.parent.id):
            column.set_derived_id(text_region.id)
        else:
            column.set_derived_id(text_region.parent.id)
            column.set_parent(text_region.parent)
        columns.append(column)
    return merge_overlapping_columns(text_region, columns)
def handle_extra_lines(text_region: pdm.PageXMLTextRegion, columns: List[pdm.PageXMLColumn],
                       extra_lines: List[pdm.PageXMLTextLine], gap_threshold: int = 50,
                       debug: bool = False):
    """Attach left-over lines to existing columns or turn them into new columns.

    For each extra line the best column is searched among ``columns``: a
    column replaces the current candidate when its horizontal overlap with the
    line is larger than the candidate's AND it is narrower than the candidate
    (the first overlapping column always becomes the candidate). If the line
    horizontally overlaps the chosen column, it is appended to that column's
    lines and the column coordinates are recomputed.

    Lines that fit no column are wrapped in a temporary text region which is
    split into columns with ``split_lines_on_column_gaps``; those columns are
    appended to ``columns``.

    NOTE(review): ``split_lines_on_column_gaps`` calls this function again, so
    lines that can never be placed may lead to unbounded recursion — confirm
    that this cannot happen for real input.

    :param text_region: the region the lines come from; its parent (if any)
        is used for deriving ids and as parent of new columns.
    :param columns: the existing columns. This list and its columns are
        modified IN PLACE.
    :param extra_lines: the lines that were not assigned to a column.
    :param gap_threshold: passed on to ``split_lines_on_column_gaps``.
    :param debug: if True, print progress information.
    :return: the ``columns`` list, extended with columns for unplaced lines.
    :raises ValueError: if no coordinates can be derived for the unplaced lines.
    """
    if debug:
        print("NUM COLUMNS:", len(columns))
        print("EXTRA LINES BEFORE:", len(extra_lines))
        for line in extra_lines:
            print('\tEXTRA LINE:', line.text, line.coords)
    non_col_lines = []
    append_count = 0
    for line in extra_lines:
        best_overlap = 0
        best_column = None
        for column in columns:
            overlap = pdm.get_horizontal_overlap(line, column)
            if overlap > best_overlap:
                if best_column is None or column.coords.width < best_column.coords.width:
                    best_column = column
                    best_overlap = overlap
        if best_column is not None and pdm.is_horizontally_overlapping(line, best_column):
            best_column.lines.append(line)
            # The added line may extend the column, so recompute its coords.
            best_column.coords = pdm.parse_derived_coords(best_column.lines)
            if text_region.parent:
                best_column.set_derived_id(text_region.parent.id)
        else:
            non_col_lines.append(line)
        append_count += 1
    if debug:
        print('append_count:', append_count)
        print("EXTRA LINES AFTER:", len(non_col_lines))
    if not non_col_lines:
        return columns
    try:
        coords = pdm.parse_derived_coords(non_col_lines)
    except Exception as err:
        # Catch Exception rather than BaseException so that KeyboardInterrupt
        # and SystemExit are not converted into a ValueError.
        for line in non_col_lines:
            print(line.coords.box, line.text)
        raise ValueError('Cannot generate column coords for extra lines') from err
    extra = pdm.PageXMLTextRegion(metadata=text_region.metadata, coords=coords,
                                  lines=non_col_lines)
    if text_region.parent and text_region.parent.id:
        extra.set_derived_id(text_region.parent.id)
        extra.set_parent(text_region.parent)
    else:
        extra.set_derived_id(text_region.id)
    if debug:
        print('SPLITTING EXTRA')
    extra_cols = split_lines_on_column_gaps(extra, gap_threshold=gap_threshold)
    for extra_col in extra_cols:
        if debug:
            print('\tEXTRA COL AFTER EXTRA SPLIT:', extra_col.stats)
        extra_col.set_parent(text_region.parent)
        if text_region.parent:
            extra_col.set_derived_id(text_region.parent.id)
    # In-place extension: callers holding a reference to `columns` see the
    # added columns as well.
    columns += extra_cols
    return columns
def split_lines_on_column_gaps(text_region: pdm.PageXMLTextRegion,
                               gap_threshold: int = 50,
                               overlap_threshold: float = 0.5) -> List[pdm.PageXMLColumn]:
    """Takes a PageXMLTextRegion object and tries to split the lines into columns
    based on a minimum horizontal gap (in number of pixels) between columns.

    Horizontal ranges narrower than 20 pixels are not used as column ranges.
    Lines that fit none of the remaining ranges are handed to
    ``handle_extra_lines`` using the same ``gap_threshold``.

    :param text_region: a text region with lines (as direct children or as deeper descendants).
    :type text_region: PageXMLTextRegion
    :param gap_threshold: the minimum number of horizontal pixels between columns of
        horizontally aligned lines to be considered a column boundary. Default is 50.
    :type gap_threshold: int
    :param overlap_threshold: the minimum fraction of a line's width that must fall
        inside a column range for the line to be assigned to that column. Default is 0.5.
    :type overlap_threshold: float
    :return: the columns found in the text region.
    :rtype: List[PageXMLColumn]
    """
    lines = text_region.get_lines()
    column_ranges = [
        col_range
        for col_range in find_column_gaps(lines, gap_threshold=gap_threshold)
        if col_range["end"] - col_range["start"] >= 20
    ]
    column_lines, extra_lines = sort_lines_in_column_ranges(lines, column_ranges, overlap_threshold)
    columns = make_column_range_columns(text_region, column_lines)
    # Pass the caller's gap_threshold on, so left-over lines are split with
    # the same setting instead of the default of handle_extra_lines.
    return handle_extra_lines(text_region, columns, extra_lines, gap_threshold=gap_threshold)