Source code for pagexml.model.physical_document_model

from __future__ import annotations

import json
from collections import defaultdict
from typing import Dict, List, Set, Tuple, Union

import numpy as np
from scipy.spatial import ConvexHull


[docs] def parse_points(points: Union[str, List[Tuple[int, int]]]) -> List[Tuple[int, int]]: """Parse a string of PageXML image coordinates into a list of coordinates.""" if isinstance(points, str): points = [point.split(',') for point in points.split(' ')] return [(int(point[0]), int(point[1])) for point in points if len(point) == 2] elif isinstance(points, list): if len(points) == 0: raise IndexError("point list cannot be empty") for point in points: if not isinstance(point, list) and not isinstance(point, tuple): print(point) print(type(point)) raise TypeError("List of points must be list of tuples with (int, int)") if not isinstance(point[0], int) or not isinstance(point[1], int): raise TypeError("List of points must be list of tuples with (int, int)") return points
[docs] class Coords: def __init__(self, points: Union[str, List[Tuple[int, int]]]): self.points: List[Tuple[int, int]] = parse_points(points) self.point_string = " ".join( ",".join([str(point[0]), str(point[1])]) for point in self.points ) self.x = min([point[0] for point in self.points]) self.y = min([point[1] for point in self.points]) self.w = max([point[0] for point in self.points]) - self.x self.h = max([point[1] for point in self.points]) - self.y self.type = "coords" def __repr__(self): return f'{self.__class__.__name__}(points="{self.point_string}")' def __str__(self): return self.__repr__() @property def json(self): return { 'type': self.type, 'points': self.points } @property def left(self): return self.x @property def right(self): return self.x + self.w @property def top(self): return self.y @property def bottom(self): return self.y + self.h @property def height(self): return self.h @property def width(self): return self.w @property def box(self): return {"x": self.x, "y": self.y, "w": self.w, "h": self.h}
[docs] class Baseline(Coords): def __init__(self, points: Union[str, List[Tuple[int, int]]]): super().__init__(points) self.type = "baseline"
[docs] def find_baseline_overlap_start_indexes(baseline1: Baseline, baseline2: Baseline) -> Tuple[int, int]: """Find the first point in each baseline where the two start to horizontally overlap.""" baseline1_start_index = 0 baseline2_start_index = 0 for bi1, p1 in enumerate(baseline1.points): if bi1 < len(baseline1.points) - 1 and baseline1.points[bi1 + 1][0] < baseline2.points[0][0]: continue baseline1_start_index = bi1 break for bi2, p2 in enumerate(baseline2.points): if bi2 < len(baseline2.points) - 1 and baseline2.points[bi2 + 1][0] < baseline1.points[0][0]: continue baseline2_start_index = bi2 break return baseline1_start_index, baseline2_start_index
[docs] def baseline_is_below(baseline1: Baseline, baseline2: Baseline) -> bool: """Test if baseline 1 is directly below baseline 2""" num_below = 0 num_overlap = 0 # find the indexes of the first baseline points where the two lines horizontally overlap index1, index2 = find_baseline_overlap_start_indexes(baseline1, baseline2) while True: # check if the current baseline point of line 1 is below that of the one of line 2 if baseline1.points[index1][1] > baseline2.points[index2][1]: num_below += 1 num_overlap += 1 # Check which baseline index to move forward for the next test if baseline1.points[index1][0] <= baseline2.points[index2][0]: # if current point of baseline 1 is to the left of the current point of baseline 2 # move to the next point of baseline 1 index1 += 1 else: # otherwise, move to the next points of baseline 2 index2 += 1 if len(baseline1.points) == index1 or len(baseline2.points) == index2: # if the end of one of the baselines is reached, counting is done break # baseline 1 is below baseline 2 if the majority of # the horizontally overlapping points is below return num_below / num_overlap > 0.5
[docs] def has_baseline(doc: PageXMLDoc) -> bool: if isinstance(doc, PageXMLTextLine): return doc.baseline is not None else: return False
[docs] def get_horizontal_overlap(doc1: PageXMLDoc, doc2: PageXMLDoc) -> int: if isinstance(doc1, PageXMLTextLine) and isinstance(doc2, PageXMLTextLine) and \ doc1.baseline is not None and doc2.baseline is not None: overlap_left = max([doc1.baseline.left, doc2.baseline.left]) overlap_right = min([doc1.baseline.right, doc2.baseline.right]) else: overlap_left = max([doc1.coords.left, doc2.coords.left]) overlap_right = min([doc1.coords.right, doc2.coords.right]) return overlap_right - overlap_left if overlap_right > overlap_left else 0
[docs] def get_vertical_overlap(doc1: PageXMLDoc, doc2: PageXMLDoc) -> int: overlap_top = max([doc1.coords.top, doc2.coords.top]) overlap_bottom = min([doc1.coords.bottom, doc2.coords.bottom]) return overlap_bottom - overlap_top if overlap_bottom > overlap_top else 0
[docs] def is_vertically_overlapping(region1: PageXMLDoc, region2: PageXMLDoc, threshold: float = 0.5) -> bool: if region1.coords is None: raise ValueError(f"No coords for {region1.id}") elif region2.coords is None: raise ValueError(f"No coords for {region2.id}") if region1.coords.height == 0 and region2.coords.height == 0: return False elif region1.coords.height == 0: return region2.coords.top <= region1.coords.top <= region2.coords.bottom elif region2.coords.height == 0: return region1.coords.top <= region2.coords.top <= region1.coords.bottom v_overlap = get_vertical_overlap(region1, region2) return v_overlap / min(region1.coords.height, region2.coords.height) > threshold
[docs] def is_horizontally_overlapping(region1: PageXMLDoc, region2: PageXMLDoc, threshold: float = 0.5) -> bool: if region1.coords is None: raise ValueError(f"No coords for {region1.id}") elif region2.coords is None: raise ValueError(f"No coords for {region2.id}") h_overlap = get_horizontal_overlap(region1, region2) if region1.coords.width == 0 and region2.coords.width == 0: return False elif region1.coords.width == 0: return region2.coords.left <= region1.coords.left <= region2.coords.right elif region2.coords.width == 0: return region1.coords.left <= region2.coords.left <= region1.coords.right return h_overlap / min(region1.coords.width, region2.coords.width) > threshold
[docs] def is_below(region1: PageXMLTextRegion, region2: PageXMLTextRegion, margin: int = 20) -> bool: if is_horizontally_overlapping(region1, region2): return region1.coords.top > region2.coords.bottom - margin else: return False
[docs] def is_next_to(region1: PageXMLTextRegion, region2: PageXMLTextRegion, margin: int = 20) -> bool: if is_vertically_overlapping(region1, region2): return region1.coords.left > region2.coords.right - margin else: return False
[docs] def horizontal_distance(doc1: PageXMLDoc, doc2: PageXMLDoc): if doc1.coords.right < doc2.coords.left: # doc1 is to the left of doc2 return doc2.coords.left - doc1.coords.right elif doc1.coords.left > doc2.coords.right: # doc1 is to the right of doc2 return doc1.coords.left - doc2.coords.right else: # doc1 and doc2 horizontally overlap return 0
[docs] def vertical_distance(doc1: PageXMLDoc, doc2: PageXMLDoc): if doc1.coords.bottom < doc2.coords.top: # doc1 is above doc2 return doc2.coords.top - doc1.coords.bottom elif doc1.coords.top > doc2.coords.bottom: # doc1 is below doc2 return doc1.coords.top - doc2.coords.bottom else: # doc1 and doc2 vertically overlap return 0
[docs] def get_horizontal_diff(doc1: PageXMLDoc, doc2: PageXMLDoc) -> int: if isinstance(doc1, PageXMLTextLine) and isinstance(doc2, PageXMLTextLine) and \ doc1.baseline is not None and doc2.baseline is not None: return abs(doc1.baseline.left - doc2.baseline.left) else: return abs(doc1.coords.left - doc2.coords.left)
[docs] def get_horizontal_diff_ratio(doc1: PageXMLDoc, doc2: PageXMLDoc) -> float: horizontal_diff = get_horizontal_diff(doc1, doc2) max_right = max(doc1.coords.right, doc2.coords.right) min_left = min(doc1.coords.left, doc2.coords.left) return horizontal_diff / (max_right - min_left)
[docs] def get_horizontal_overlap_ratio(doc1: PageXMLDoc, doc2: PageXMLDoc) -> float: horizontal_overlap = get_horizontal_overlap(doc1, doc2) max_right = max(doc1.coords.right, doc2.coords.right) min_left = min(doc1.coords.left, doc2.coords.left) return horizontal_overlap / (max_right - min_left)
[docs] def get_vertical_diff(doc1: PageXMLDoc, doc2: PageXMLDoc) -> int: if isinstance(doc1, PageXMLTextLine) and isinstance(doc2, PageXMLTextLine) and \ doc1.baseline is not None and doc2.baseline is not None: return abs(doc1.baseline.top - doc2.baseline.top) else: return abs(doc1.coords.top - doc2.coords.top)
[docs] def get_vertical_diff_ratio(doc1: PageXMLDoc, doc2: PageXMLDoc) -> float: vertical_diff = get_vertical_diff(doc1, doc2) max_bottom = max(doc1.coords.bottom, doc2.coords.bottom) min_top = min(doc1.coords.top, doc2.coords.top) return vertical_diff / (max_bottom - min_top)
[docs] def get_vertical_overlap_ratio(doc1: PageXMLDoc, doc2: PageXMLDoc) -> float: vertical_overlap = get_vertical_overlap(doc1, doc2) max_bottom = max(doc1.coords.bottom, doc2.coords.bottom) min_top = min(doc1.coords.top, doc2.coords.top) return vertical_overlap / (max_bottom - min_top)
[docs] def sort_lines(line1: PageXMLTextLine, line2: PageXMLTextLine, as_column: bool = True): if get_horizontal_overlap(line1, line2): if get_vertical_overlap(line1, line2): # check which orientation dominates the difference horizontal_ratio = get_horizontal_diff_ratio(line1, line2) vertical_ratio = get_vertical_diff_ratio(line1, line2) if vertical_ratio < 0.2 and horizontal_ratio > 0.8: return line1.coords.left < line2.coords.left else: return line1.coords.top < line2.coords.top else: return line1.is_below(line2) is False elif get_vertical_overlap(line1, line2): return line1.coords.left < line2.coords.left elif as_column is True: # assume lines in a single column, so read from top to bottom return line1.coords.top < line2.coords.top else: # assume lines in multiple columns, so read from left to right return line1.coords.left < line2.coords.left
[docs] def parse_derived_coords(document_list: list) -> Coords: """Derive scan coordinates for a composite document based on the list of documents it contains. A convex hull is drawn around all points of all contained documents.""" return coords_list_to_hull_coords([document.coords for document in document_list])
[docs] def coords_list_to_hull_coords(coords_list): # print(coords_list) points = np.array([point for coords in coords_list for point in coords.points]) # print(points) try: edges = points_to_hull_edges(points) except IndexError: print([coords for coords in coords_list]) print('points:', points) raise # print(edges) hull_points = edges_to_hull_points(edges) return Coords(hull_points)
[docs] def points_to_hull_edges(points): hull = ConvexHull(points) edges = defaultdict(dict) for simplex in hull.simplices: p1 = (int(points[simplex, 0][0]), int(points[simplex, 1][0])) p2 = (int(points[simplex, 0][1]), int(points[simplex, 1][1])) edges[p2][p1] = 1 edges[p1][p2] = 1 return edges
[docs] def edges_to_hull_points(edges): nodes = list(edges.keys()) curr_point = sorted(nodes)[0] sorted_nodes = [curr_point] while len(sorted_nodes) < len(nodes): for next_point in edges[curr_point]: if next_point not in sorted_nodes: sorted_nodes.append(next_point) curr_point = next_point break return sorted_nodes
[docs] class StructureDoc: def __init__(self, doc_id: Union[None, str] = None, doc_type: Union[None, str, List[str]] = None, metadata: Dict[str, any] = None, reading_order: Dict[int, str] = None): self.id = doc_id self.type = doc_type self.main_type = 'doc' self.metadata = metadata if metadata else {} # if self.id and 'id' not in self.metadata: # self.metadata['id'] = self.id # if self.metadata and 'type' not in self.metadata: # self.metadata['type'] = self.type self.reading_order: Dict[int, str] = reading_order if reading_order else {} self.reading_order_number = {} self.parent: Union[StructureDoc, None] = None self.logical_parent: Union[StructureDoc, None] = None
[docs] def set_parent(self, parent: StructureDoc): """Set parent document and add metadata of parent to this document's metadata""" self.parent = parent self.add_parent_id_to_metadata()
[docs] def add_type(self, doc_type: Union[str, List[str]]) -> None: doc_types = [doc_type] if isinstance(doc_type, str) else doc_type if isinstance(self.type, str): self.type = [self.type] for doc_type in doc_types: if doc_type not in self.type: self.type.append(doc_type)
[docs] def remove_type(self, doc_type: Union[str, List[str]]) -> None: doc_types = [doc_type] if isinstance(doc_type, str) else doc_type if isinstance(self.type, str): self.type = [self.type] for doc_type in doc_types: if doc_type in self.type: self.type.remove(doc_type) if len(self.type) == 1: self.type = self.type[0]
[docs] def has_type(self, doc_type: str) -> bool: if isinstance(self.type, str): return doc_type == self.type else: return doc_type in self.type
@property def types(self) -> Set[str]: if isinstance(self.type, str): return {self.type} else: return set(self.type)
[docs] def set_as_parent(self, children: List[StructureDoc]): """Set this document as parent of a list of child documents""" for child in children: child.set_parent(self)
[docs] def add_parent_id_to_metadata(self): if self.parent: self.metadata['parent_type'] = self.parent.main_type self.metadata['parent_id'] = self.parent.id if hasattr(self.parent, 'main_type'): self.metadata[f'{self.parent.main_type}_id'] = self.parent.id if self.logical_parent: self.metadata['logical_parent_type'] = self.logical_parent.main_type self.metadata['logical_parent_id'] = self.logical_parent.id if hasattr(self.logical_parent, 'main_type'): self.metadata[f'{self.logical_parent.main_type}_id'] = self.logical_parent.id
@property def json(self) -> Dict[str, any]: json_data = { 'id': self.id, 'type': self.type, 'metadata': self.metadata } if self.reading_order: json_data['reading_order'] = self.reading_order return json_data
[docs] class PhysicalStructureDoc(StructureDoc): def __init__(self, doc_id: str = None, doc_type: Union[str, List[str]] = None, metadata: Dict[str, any] = None, coords: Coords = None, reading_order: Dict[int, str] = None): super().__init__(doc_id=doc_id, doc_type=doc_type, metadata=metadata, reading_order=reading_order) self.coords: Union[None, Coords] = coords self.main_type = 'physical_structure_doc' @property def json(self) -> Dict[str, any]: doc_json = super().json if self.coords: doc_json['coords'] = self.coords.points return doc_json
[docs] def set_derived_id(self, parent_id: str): box_string = f"{self.coords.x}-{self.coords.y}-{self.coords.w}-{self.coords.h}" self.id = f"{parent_id}-{self.main_type}-{box_string}"
# self.metadata['id'] = self.id
[docs] class LogicalStructureDoc(StructureDoc): def __init__(self, doc_id: str = None, doc_type: Union[str, List[str]] = None, metadata: Dict[str, any] = None, lines: List[PageXMLTextLine] = None, text_regions: List[PageXMLTextRegion] = None, reading_order: Dict[int, str] = None): super().__init__(doc_id, doc_type, metadata, reading_order=reading_order) self.lines: List[PageXMLTextLine] = lines if lines else [] self.text_regions: List[PageXMLTextRegion] = text_regions if text_regions else [] self.logical_parent: Union[StructureDoc, None] = None
[docs] def set_logical_parent(self, parent: StructureDoc): """Set parent document and add metadata of parent to this document's metadata""" self.logical_parent = parent self.add_logical_parent_id_to_metadata()
[docs] def set_as_logical_parent(self, children: Union[StructureDoc, List[StructureDoc]]): if isinstance(children, StructureDoc): children = [children] for child in children: child.parent = self
[docs] def add_logical_parent_id_to_metadata(self): if self.logical_parent: self.metadata['logical_parent_type'] = self.logical_parent.main_type self.metadata['logical_parent_id'] = self.logical_parent.id if hasattr(self.logical_parent, 'main_type'): self.metadata[f'{self.logical_parent.main_type}_id'] = self.logical_parent.id
[docs] class PageXMLDoc(PhysicalStructureDoc): def __init__(self, doc_id: str = None, doc_type: Union[str, List[str]] = None, metadata: Dict[str, any] = None, coords: Coords = None, reading_order: Dict[int, str] = None): super().__init__(doc_id=doc_id, doc_type="pagexml_doc", metadata=metadata, reading_order=reading_order) self.coords: Union[None, Coords] = coords self.add_type(doc_type) self.main_type = 'pagexml_doc' @property def stats(self): return {}
[docs] class PageXMLWord(PageXMLDoc): def __init__(self, doc_id: str = None, doc_type: Union[str, List[str]] = None, metadata: Dict[str, any] = None, coords: Coords = None, conf: float = None, text: str = None): super().__init__(doc_id, "word", metadata, coords) self.conf = conf self.text = text self.main_type = 'word' if doc_type: self.add_type(doc_type) def __repr__(self): content_string = f"id={self.id}, type={self.type}, text={self.text}" if self.conf is not None: content_string += f", conf={self.conf}" return f"{self.__class__.__name__}({content_string})" @property def json(self) -> Dict[str, any]: doc_json = super().json doc_json['text'] = self.text if self.conf: doc_json['conf'] = self.conf return doc_json
[docs] class PageXMLTextLine(PageXMLDoc): def __init__(self, doc_id: str = None, doc_type: Union[str, List[str]] = None, metadata: Dict[str, any] = None, coords: Coords = None, baseline: Baseline = None, xheight: int = None, conf: float = None, text: str = None, words: List[PageXMLWord] = None, reading_order: Dict[int, str] = None): super().__init__(doc_id=doc_id, doc_type="line", metadata=metadata, coords=coords, reading_order=reading_order) self.main_type = 'line' self.conf = conf self.text: Union[None, str] = text self.xheight: Union[None, int] = xheight self.baseline: Union[None, Baseline] = baseline self.words: List[PageXMLWord] = words if words else [] self.metadata['type'] = 'line' self.set_as_parent(self.words) if doc_type: self.add_type(doc_type) def __repr__(self): content_string = f"id={self.id}, type={self.type}, text=\"{self.text}\" conf={self.conf}" return f"{self.__class__.__name__}({content_string})" def __lt__(self, other: PageXMLTextLine): """For sorting text lines. Assumptions: reading from left to right, top to bottom. If two lines are horizontally overlapping, sort from top to bottom, even if the upper lines is more horizontally indented.""" if other == self: return False return sort_lines(self, other, as_column=True) @property def json(self) -> Dict[str, any]: doc_json = super().json doc_json['text'] = self.text if self.conf is not None: doc_json['conf'] = self.conf if self.baseline: doc_json['baseline'] = self.baseline.points if self.words: doc_json['words'] = [word.json for word in self.words] if self.xheight: doc_json['xheight'] = self.xheight return doc_json @property def stats(self): return { 'words': self.num_words }
[docs] def get_words(self): if self.words: return self.words elif self.text: return self.text.split(' ') else: return []
@property def num_words(self): return len(self.get_words())
[docs] def is_below(self, other: PageXMLTextLine) -> bool: """Test if the baseline of this line is directly below the baseline of the other line.""" # if there is no horizontal overlap, this line is not directly below the other if not get_horizontal_overlap(self, other): # print("NO HORIZONTAL OVERLAP") return False # if the bottom of this line is above the top of the other line, this line is above the other if self.baseline.bottom < other.baseline.top: # print("BOTTOM IS ABOVE TOP") return False # if most of this line's baseline points are not below most the other's baseline points # this line is not below the other if baseline_is_below(self.baseline, other.baseline): # print("BASELINE IS BELOW") return True return False
[docs] def is_next_to(self, other: PageXMLTextLine) -> bool: """Test if this line is vertically aligned with the other line.""" if get_vertical_overlap(self, other) == 0: # print("NO VERTICAL OVERLAP") return False if get_horizontal_overlap(self, other) > 40: # print("TOO MUCH HORIZONTAL OVERLAP", horizontal_overlap(self.coords, other.coords)) return False if self.baseline.top > other.baseline.bottom + 10: # print("VERTICAL BASELINE GAP TOO BIG") return False elif self.baseline.bottom < other.baseline.top - 10: return False else: return True
[docs] class PageXMLTextRegion(PageXMLDoc): def __init__(self, doc_id: str = None, doc_type: Union[str, List[str]] = None, metadata: Dict[str, any] = None, coords: Coords = None, text_regions: List[PageXMLTextRegion] = None, lines: List[PageXMLTextLine] = None, text: str = None, orientation: float = None, reading_order: Dict[int, str] = None): super().__init__(doc_id=doc_id, doc_type="text_region", metadata=metadata, coords=coords, reading_order=reading_order) self.main_type = 'text_region' self.text_regions: List[PageXMLTextRegion] = text_regions if text_regions is not None else [] self.lines: List[PageXMLTextLine] = lines if lines is not None else [] self.orientation: Union[None, float] = orientation self.reading_order_number = {} self.text = text if self.lines is not None: self.set_as_parent(self.lines) if self.lines is not None: self.set_as_parent(self.lines) if self.text_regions is not None: self.set_as_parent(self.text_regions) if self.reading_order: self.set_text_regions_in_reader_order() if doc_type: self.add_type(doc_type) def __repr__(self): stats = json.dumps(self.stats) content_string = f"\n\tid={self.id}, \n\ttype={self.type}, \n\tstats={stats}" return f"{self.__class__.__name__}({content_string}\n)" def __lt__(self, other: PageXMLTextRegion): """For sorting text regions. Assumptions: reading from left to right, top to bottom. If two regions are horizontally overlapping, sort from top to bottom, even if the upper region is more horizontally indented.""" if other == self: return False if is_horizontally_overlapping(self, other): return self.coords.top < other.coords.top else: return self.coords.left < other.coords.left
[docs] def add_child(self, child: PageXMLDoc): child.set_parent(self) if isinstance(child, PageXMLTextLine): self.lines.append(child) elif isinstance(child, PageXMLTextRegion): self.text_regions.append(child) else: raise TypeError(f'unknown child type: {child.__class__.__name__}') self.coords = parse_derived_coords(self.text_regions + self.lines)
@property def json(self) -> Dict[str, any]: doc_json = super().json if self.text: doc_json['text'] = self.text if self.lines: doc_json['lines'] = [line.json for line in self.lines] if self.text_regions: doc_json['text_regions'] = [text_region.json for text_region in self.text_regions] if self.orientation: doc_json['orientation'] = self.orientation doc_json['stats'] = self.stats return doc_json
[docs] def get_text_regions_in_reading_order(self): if not self.reading_order: return self.text_regions tr_ids = list({region_id: None for _index, region_id in sorted(self.reading_order.items(), key=lambda x: x[0])}) tr_map = {} for text_region in self.text_regions: # if text_region.id not in tr_ids: # print("reading order:", self.reading_order) # raise KeyError(f"text_region with id {text_region.id} is not listed in reading_order") tr_map[text_region.id] = text_region return [tr_map[tr_id] for tr_id in tr_ids if tr_id in tr_map]
[docs] def set_text_regions_in_reader_order(self): tr_ids = [tr.id for tr in self.text_regions] for order_number in self.reading_order: text_region_id = self.reading_order[order_number] self.reading_order_number[text_region_id] = order_number for tr_id in tr_ids: if tr_id not in self.reading_order_number: # there is a text_region that was not in the original PageXML output: # ignore reading order self.reading_order = None return None self.text_regions = self.get_text_regions_in_reading_order()
[docs] def get_all_text_regions(self): text_regions: Set[PageXMLTextRegion] = set() for text_region in self.text_regions: text_regions.add(text_region) if text_region.text_regions: text_regions += text_region.get_all_text_regions() return text_regions
[docs] def get_inner_text_regions(self) -> List[PageXMLTextRegion]: text_regions: List[PageXMLTextRegion] = [] for text_region in self.text_regions: if text_region.text_regions: text_regions += text_region.get_inner_text_regions() elif text_region.lines: text_regions.append(text_region) if not self.text_regions and self.lines: text_regions.append(self) return text_regions
[docs] def get_lines(self) -> List[PageXMLTextLine]: lines: List[PageXMLTextLine] = [] if self.text_regions: if self.reading_order and all([tr.id in self.reading_order for tr in self.text_regions]): for tr in sorted(self.text_regions, key=lambda t: self.reading_order_number[t.id]): lines += tr.get_lines() else: for text_region in sorted(self.text_regions): lines += text_region.get_lines() if self.lines: lines += self.lines return lines
[docs] def get_words(self) -> Union[List[str], List[PageXMLWord]]: words = [] if self.text is not None: return self.text.split(' ') if self.lines: for line in self.lines: if line.words: words += line.words elif line.text: words += line.text.split(' ') if self.text_regions: for tr in self.text_regions: words += tr.get_words() return words
@property def num_lines(self): return len(self.get_lines()) @property def num_words(self): return len(self.get_words()) @property def num_text_regions(self): return len(self.text_regions) @property def stats(self): return { 'lines': self.num_lines, 'words': self.num_words, 'text_regions': self.num_text_regions }
[docs] class PageXMLColumn(PageXMLTextRegion): def __init__(self, doc_id: str = None, doc_type: Union[str, List[str]] = None, metadata: Dict[str, any] = None, coords: Coords = None, text_regions: List[PageXMLTextRegion] = None, lines: List[PageXMLTextLine] = None, reading_order: Dict[int, str] = None): super().__init__(doc_id=doc_id, doc_type="column", metadata=metadata, coords=coords, lines=lines, text_regions=text_regions, reading_order=reading_order) self.main_type = 'column' if doc_type: self.add_type(doc_type) @property def json(self) -> Dict[str, any]: doc_json = super().json doc_json['stats'] = self.stats return doc_json @property def stats(self): stats = super().stats return stats
[docs] class PageXMLPage(PageXMLTextRegion): def __init__(self, doc_id: str = None, doc_type: Union[str, List[str]] = None, metadata: Dict[str, any] = None, coords: Coords = None, columns: List[PageXMLColumn] = None, text_regions: List[PageXMLTextRegion] = None, extra: List[PageXMLTextRegion] = None, lines: List[PageXMLTextLine] = None, reading_order: Dict[int, str] = None): super().__init__(doc_id=doc_id, doc_type="page", metadata=metadata, coords=coords, lines=lines, text_regions=text_regions, reading_order=reading_order) self.main_type = 'page' self.columns: List[PageXMLColumn] = columns if columns else [] self.extra: List[PageXMLTextRegion] = extra if extra else [] self.set_as_parent(self.columns) self.set_as_parent(self.extra) if doc_type: self.add_type(doc_type)
[docs] def get_lines(self): lines = [] if self.columns: # First, add lines from columns for column in sorted(self.columns): lines += column.get_lines() # Second, add lines from text_regions if self.extra: for tr in self.extra: lines += tr.get_lines() if self.text_regions: # print('get_lines - reading_order_number:', self.reading_order_number) # print('get_lines - reading_order:', self.reading_order) if self.reading_order and all([tr.id in self.reading_order for tr in self.text_regions]): for tr in sorted(self.text_regions, key=lambda t: self.reading_order_number[t]): lines += tr.get_lines() else: for tr in sorted(self.text_regions): lines += tr.get_lines() if self.lines: raise AttributeError(f'page {self.id} has lines as direct property') return lines
[docs] def add_child(self, child: PageXMLDoc, as_extra: bool = False): # print('as_extra:', as_extra) # print('stats before adding:', self.stats) child.set_parent(self) if as_extra and (isinstance(child, PageXMLColumn) or isinstance(child, PageXMLTextRegion)): self.extra.append(child) elif isinstance(child, PageXMLColumn) or child.__class__.__name__ == 'PageXMLColumn': self.columns.append(child) elif isinstance(child, PageXMLTextLine): self.lines.append(child) elif isinstance(child, PageXMLTextRegion): self.text_regions.append(child) else: raise TypeError(f'unknown child type: {child.__class__.__name__}') self.coords = parse_derived_coords(self.extra + self.columns + self.text_regions + self.lines)
# print('stats after adding:', self.stats)
[docs] def get_all_text_regions(self): text_regions = [tr for col in self.columns for tr in col.text_regions] text_regions.extend([tr for tr in self.extra]) return text_regions
[docs] def get_text_regions_in_reading_order(self, include_extra: bool = True): text_regions = [] if len(self.text_regions) > 0: text_regions.extend(self.text_regions) if hasattr(self, 'columns'): for col in sorted(self.columns): text_regions.extend(col.get_text_regions_in_reading_order()) if include_extra and hasattr(self, 'extra'): text_regions.extend(sorted(self.extra)) return text_regions
[docs] def get_inner_text_regions(self) -> List[PageXMLTextRegion]: text_regions = self.get_all_text_regions() inner_trs = [] for tr in text_regions: inner_trs.extend(tr.get_inner_text_regions()) return inner_trs
@property def json(self) -> Dict[str, any]: doc_json = super().json # if self.lines: # doc_json['lines'] = [line.json for line in self.lines] # if self.text_regions: # doc_json['text_regions'] = [text_region.json for text_region in self.text_regions] if self.columns: doc_json['columns'] = [column.json for column in self.columns] if self.extra: doc_json['extra'] = [text_region.json for text_region in self.extra] doc_json['stats'] = self.stats return doc_json @property def stats(self): """Pages diverge from other types since they have columns and extra text regions, or plain text regions, so have their own way of calculating stats.""" lines = self.get_lines() stats = { "words": sum([len(line.get_words()) for line in lines]), "lines": len(lines) } if self.columns: stats['columns'] = len(self.columns) if self.extra: stats['extra'] = len(self.extra) if self.text_regions: stats['text_regions'] = len(self.text_regions) return stats
[docs] class PageXMLScan(PageXMLTextRegion): def __init__(self, doc_id: str = None, doc_type: Union[str, List[str]] = None, metadata: Dict[str, any] = None, coords: Coords = None, pages: List[PageXMLPage] = None, columns: List[PageXMLColumn] = None, text_regions: List[PageXMLTextRegion] = None, lines: List[PageXMLTextLine] = None, reading_order: Dict[int, str] = None): super().__init__(doc_id=doc_id, doc_type="scan", metadata=metadata, coords=coords, lines=lines, text_regions=text_regions, reading_order=reading_order) self.main_type = 'scan' self.pages: List[PageXMLPage] = pages if pages else [] self.columns: List[PageXMLColumn] = columns if columns else [] self.set_as_parent(self.pages) self.set_as_parent(self.columns) if doc_type: self.add_type(doc_type) self.set_scan_id_as_metadata()
[docs] def add_child(self, child: PageXMLDoc): child.set_parent(self) if isinstance(child, PageXMLPage): self.pages.append(child) elif isinstance(child, PageXMLColumn): self.columns.append(child) elif isinstance(child, PageXMLTextRegion): self.text_regions.append(child) elif isinstance(child, PageXMLTextLine): self.lines.append(child)
[docs] def set_scan_id_as_metadata(self): self.metadata['scan_id'] = self.id for tr in self.get_all_text_regions(): tr.metadata['scan_id'] = self.id for line in self.get_lines(): line.metadata['scan_id'] = self.id for word in self.get_words(): if isinstance(word, PageXMLWord): word.metadata['scan_id'] = self.id
@property def json(self) -> Dict[str, any]: doc_json = super().json # if self.lines: # doc_json['lines'] = [line.json for line in self.lines] # if self.text_regions: # doc_json['text_regions'] = [text_region.json for text_region in self.text_regions] if self.columns: doc_json['columns'] = [line.json for line in self.columns] if self.pages: doc_json['pages'] = [line.json for line in self.pages] doc_json['stats'] = self.stats return doc_json @property def stats(self): stats = super().stats stats['columns'] = len([column for page in self.pages for column in page.columns]) stats['extra'] = len([text_region for page in self.pages for text_region in page.extra]) stats['pages'] = len(self.pages) return stats
[docs] def set_parentage(parent_doc: StructureDoc): if hasattr(parent_doc, 'pages') and parent_doc.pages: parent_doc.set_as_parent(parent_doc.pages) for page in parent_doc.pages: set_parentage(page) if hasattr(parent_doc, 'columns') and parent_doc.columns: parent_doc.set_as_parent(parent_doc.columns) for column in parent_doc.columns: set_parentage(column) if hasattr(parent_doc, 'text_regions') and parent_doc.text_regions: parent_doc.set_as_parent(parent_doc.text_regions) for text_region in parent_doc.text_regions: set_parentage(text_region) if hasattr(parent_doc, 'lines') and parent_doc.lines: parent_doc.set_as_parent(parent_doc.lines) for line in parent_doc.lines: set_parentage(line) if hasattr(parent_doc, 'words') and parent_doc.words: parent_doc.set_as_parent(parent_doc.words) for word in parent_doc.words: set_parentage(word)
[docs] def in_same_column(element1: PageXMLDoc, element2: PageXMLDoc) -> bool: """Check if two PageXML elements are part of the same column.""" if ( 'scan_id' in element1.metadata and 'scan_id' in element2.metadata and element1.metadata['scan_id'] != element2.metadata['scan_id'] ): return False if 'column_id' in element1.metadata and 'column_id' in element2.metadata: return element1.metadata['column_id'] == element2.metadata['column_id'] else: # check if the two lines have a horizontal overlap that is more than 50% of the width of line 1 # Note: this doesn't work for short adjacent lines within the same column return get_horizontal_overlap(element1, element2) > (element1.coords.w / 2)