from __future__ import annotations
import json
from collections import defaultdict
from typing import Dict, List, Set, Tuple, Union
import numpy as np
from scipy.spatial import ConvexHull
[docs]
def parse_points(points: Union[str, List[Tuple[int, int]]]) -> List[Tuple[int, int]]:
"""Parse a string of PageXML image coordinates into a list of coordinates."""
if isinstance(points, str):
points = [point.split(',') for point in points.split(' ')]
return [(int(point[0]), int(point[1])) for point in points if len(point) == 2]
elif isinstance(points, list):
if len(points) == 0:
raise IndexError("point list cannot be empty")
for point in points:
if not isinstance(point, list) and not isinstance(point, tuple):
print(point)
print(type(point))
raise TypeError("List of points must be list of tuples with (int, int)")
if not isinstance(point[0], int) or not isinstance(point[1], int):
raise TypeError("List of points must be list of tuples with (int, int)")
return points
[docs]
class Coords:
def __init__(self, points: Union[str, List[Tuple[int, int]]]):
self.points: List[Tuple[int, int]] = parse_points(points)
self.point_string = " ".join(
",".join([str(point[0]), str(point[1])]) for point in self.points
)
self.x = min([point[0] for point in self.points])
self.y = min([point[1] for point in self.points])
self.w = max([point[0] for point in self.points]) - self.x
self.h = max([point[1] for point in self.points]) - self.y
self.type = "coords"
def __repr__(self):
return f'{self.__class__.__name__}(points="{self.point_string}")'
def __str__(self):
return self.__repr__()
@property
def json(self):
return {
'type': self.type,
'points': self.points
}
@property
def left(self):
return self.x
@property
def right(self):
return self.x + self.w
@property
def top(self):
return self.y
@property
def bottom(self):
return self.y + self.h
@property
def height(self):
return self.h
@property
def width(self):
return self.w
@property
def box(self):
return {"x": self.x, "y": self.y, "w": self.w, "h": self.h}
[docs]
class Baseline(Coords):
def __init__(self, points: Union[str, List[Tuple[int, int]]]):
super().__init__(points)
self.type = "baseline"
[docs]
def find_baseline_overlap_start_indexes(baseline1: Baseline, baseline2: Baseline) -> Tuple[int, int]:
"""Find the first point in each baseline where the two start to horizontally overlap."""
baseline1_start_index = 0
baseline2_start_index = 0
for bi1, p1 in enumerate(baseline1.points):
if bi1 < len(baseline1.points) - 1 and baseline1.points[bi1 + 1][0] < baseline2.points[0][0]:
continue
baseline1_start_index = bi1
break
for bi2, p2 in enumerate(baseline2.points):
if bi2 < len(baseline2.points) - 1 and baseline2.points[bi2 + 1][0] < baseline1.points[0][0]:
continue
baseline2_start_index = bi2
break
return baseline1_start_index, baseline2_start_index
[docs]
def baseline_is_below(baseline1: Baseline, baseline2: Baseline) -> bool:
"""Test if baseline 1 is directly below baseline 2"""
num_below = 0
num_overlap = 0
# find the indexes of the first baseline points where the two lines horizontally overlap
index1, index2 = find_baseline_overlap_start_indexes(baseline1, baseline2)
while True:
# check if the current baseline point of line 1 is below that of the one of line 2
if baseline1.points[index1][1] > baseline2.points[index2][1]:
num_below += 1
num_overlap += 1
# Check which baseline index to move forward for the next test
if baseline1.points[index1][0] <= baseline2.points[index2][0]:
# if current point of baseline 1 is to the left of the current point of baseline 2
# move to the next point of baseline 1
index1 += 1
else:
# otherwise, move to the next points of baseline 2
index2 += 1
if len(baseline1.points) == index1 or len(baseline2.points) == index2:
# if the end of one of the baselines is reached, counting is done
break
# baseline 1 is below baseline 2 if the majority of
# the horizontally overlapping points is below
return num_below / num_overlap > 0.5
[docs]
def has_baseline(doc: PageXMLDoc) -> bool:
if isinstance(doc, PageXMLTextLine):
return doc.baseline is not None
else:
return False
[docs]
def get_horizontal_overlap(doc1: PageXMLDoc, doc2: PageXMLDoc) -> int:
if isinstance(doc1, PageXMLTextLine) and isinstance(doc2, PageXMLTextLine) and \
doc1.baseline is not None and doc2.baseline is not None:
overlap_left = max([doc1.baseline.left, doc2.baseline.left])
overlap_right = min([doc1.baseline.right, doc2.baseline.right])
else:
overlap_left = max([doc1.coords.left, doc2.coords.left])
overlap_right = min([doc1.coords.right, doc2.coords.right])
return overlap_right - overlap_left if overlap_right > overlap_left else 0
[docs]
def get_vertical_overlap(doc1: PageXMLDoc, doc2: PageXMLDoc) -> int:
overlap_top = max([doc1.coords.top, doc2.coords.top])
overlap_bottom = min([doc1.coords.bottom, doc2.coords.bottom])
return overlap_bottom - overlap_top if overlap_bottom > overlap_top else 0
[docs]
def is_vertically_overlapping(region1: PageXMLDoc,
region2: PageXMLDoc,
threshold: float = 0.5) -> bool:
if region1.coords is None:
raise ValueError(f"No coords for {region1.id}")
elif region2.coords is None:
raise ValueError(f"No coords for {region2.id}")
if region1.coords.height == 0 and region2.coords.height == 0:
return False
elif region1.coords.height == 0:
return region2.coords.top <= region1.coords.top <= region2.coords.bottom
elif region2.coords.height == 0:
return region1.coords.top <= region2.coords.top <= region1.coords.bottom
v_overlap = get_vertical_overlap(region1, region2)
return v_overlap / min(region1.coords.height, region2.coords.height) > threshold
[docs]
def is_horizontally_overlapping(region1: PageXMLDoc,
region2: PageXMLDoc,
threshold: float = 0.5) -> bool:
if region1.coords is None:
raise ValueError(f"No coords for {region1.id}")
elif region2.coords is None:
raise ValueError(f"No coords for {region2.id}")
h_overlap = get_horizontal_overlap(region1, region2)
if region1.coords.width == 0 and region2.coords.width == 0:
return False
elif region1.coords.width == 0:
return region2.coords.left <= region1.coords.left <= region2.coords.right
elif region2.coords.width == 0:
return region1.coords.left <= region2.coords.left <= region1.coords.right
return h_overlap / min(region1.coords.width, region2.coords.width) > threshold
[docs]
def is_below(region1: PageXMLTextRegion, region2: PageXMLTextRegion, margin: int = 20) -> bool:
if is_horizontally_overlapping(region1, region2):
return region1.coords.top > region2.coords.bottom - margin
else:
return False
[docs]
def is_next_to(region1: PageXMLTextRegion, region2: PageXMLTextRegion, margin: int = 20) -> bool:
if is_vertically_overlapping(region1, region2):
return region1.coords.left > region2.coords.right - margin
else:
return False
[docs]
def horizontal_distance(doc1: PageXMLDoc, doc2: PageXMLDoc):
if doc1.coords.right < doc2.coords.left:
# doc1 is to the left of doc2
return doc2.coords.left - doc1.coords.right
elif doc1.coords.left > doc2.coords.right:
# doc1 is to the right of doc2
return doc1.coords.left - doc2.coords.right
else:
# doc1 and doc2 horizontally overlap
return 0
[docs]
def vertical_distance(doc1: PageXMLDoc, doc2: PageXMLDoc):
if doc1.coords.bottom < doc2.coords.top:
# doc1 is above doc2
return doc2.coords.top - doc1.coords.bottom
elif doc1.coords.top > doc2.coords.bottom:
# doc1 is below doc2
return doc1.coords.top - doc2.coords.bottom
else:
# doc1 and doc2 vertically overlap
return 0
[docs]
def get_horizontal_diff(doc1: PageXMLDoc, doc2: PageXMLDoc) -> int:
if isinstance(doc1, PageXMLTextLine) and isinstance(doc2, PageXMLTextLine) and \
doc1.baseline is not None and doc2.baseline is not None:
return abs(doc1.baseline.left - doc2.baseline.left)
else:
return abs(doc1.coords.left - doc2.coords.left)
[docs]
def get_horizontal_diff_ratio(doc1: PageXMLDoc, doc2: PageXMLDoc) -> float:
horizontal_diff = get_horizontal_diff(doc1, doc2)
max_right = max(doc1.coords.right, doc2.coords.right)
min_left = min(doc1.coords.left, doc2.coords.left)
return horizontal_diff / (max_right - min_left)
[docs]
def get_horizontal_overlap_ratio(doc1: PageXMLDoc, doc2: PageXMLDoc) -> float:
horizontal_overlap = get_horizontal_overlap(doc1, doc2)
max_right = max(doc1.coords.right, doc2.coords.right)
min_left = min(doc1.coords.left, doc2.coords.left)
return horizontal_overlap / (max_right - min_left)
[docs]
def get_vertical_diff(doc1: PageXMLDoc, doc2: PageXMLDoc) -> int:
if isinstance(doc1, PageXMLTextLine) and isinstance(doc2, PageXMLTextLine) and \
doc1.baseline is not None and doc2.baseline is not None:
return abs(doc1.baseline.top - doc2.baseline.top)
else:
return abs(doc1.coords.top - doc2.coords.top)
[docs]
def get_vertical_diff_ratio(doc1: PageXMLDoc, doc2: PageXMLDoc) -> float:
vertical_diff = get_vertical_diff(doc1, doc2)
max_bottom = max(doc1.coords.bottom, doc2.coords.bottom)
min_top = min(doc1.coords.top, doc2.coords.top)
return vertical_diff / (max_bottom - min_top)
[docs]
def get_vertical_overlap_ratio(doc1: PageXMLDoc, doc2: PageXMLDoc) -> float:
vertical_overlap = get_vertical_overlap(doc1, doc2)
max_bottom = max(doc1.coords.bottom, doc2.coords.bottom)
min_top = min(doc1.coords.top, doc2.coords.top)
return vertical_overlap / (max_bottom - min_top)
[docs]
def sort_lines(line1: PageXMLTextLine, line2: PageXMLTextLine, as_column: bool = True):
if get_horizontal_overlap(line1, line2):
if get_vertical_overlap(line1, line2):
# check which orientation dominates the difference
horizontal_ratio = get_horizontal_diff_ratio(line1, line2)
vertical_ratio = get_vertical_diff_ratio(line1, line2)
if vertical_ratio < 0.2 and horizontal_ratio > 0.8:
return line1.coords.left < line2.coords.left
else:
return line1.coords.top < line2.coords.top
else:
return line1.is_below(line2) is False
elif get_vertical_overlap(line1, line2):
return line1.coords.left < line2.coords.left
elif as_column is True:
# assume lines in a single column, so read from top to bottom
return line1.coords.top < line2.coords.top
else:
# assume lines in multiple columns, so read from left to right
return line1.coords.left < line2.coords.left
[docs]
def parse_derived_coords(document_list: list) -> Coords:
"""Derive scan coordinates for a composite document based on the list of documents it contains.
A convex hull is drawn around all points of all contained documents."""
return coords_list_to_hull_coords([document.coords for document in document_list])
[docs]
def coords_list_to_hull_coords(coords_list):
# print(coords_list)
points = np.array([point for coords in coords_list for point in coords.points])
# print(points)
try:
edges = points_to_hull_edges(points)
except IndexError:
print([coords for coords in coords_list])
print('points:', points)
raise
# print(edges)
hull_points = edges_to_hull_points(edges)
return Coords(hull_points)
[docs]
def points_to_hull_edges(points):
hull = ConvexHull(points)
edges = defaultdict(dict)
for simplex in hull.simplices:
p1 = (int(points[simplex, 0][0]), int(points[simplex, 1][0]))
p2 = (int(points[simplex, 0][1]), int(points[simplex, 1][1]))
edges[p2][p1] = 1
edges[p1][p2] = 1
return edges
[docs]
def edges_to_hull_points(edges):
nodes = list(edges.keys())
curr_point = sorted(nodes)[0]
sorted_nodes = [curr_point]
while len(sorted_nodes) < len(nodes):
for next_point in edges[curr_point]:
if next_point not in sorted_nodes:
sorted_nodes.append(next_point)
curr_point = next_point
break
return sorted_nodes
[docs]
class StructureDoc:
def __init__(self, doc_id: Union[None, str] = None, doc_type: Union[None, str, List[str]] = None,
metadata: Dict[str, any] = None, reading_order: Dict[int, str] = None):
self.id = doc_id
self.type = doc_type
self.main_type = 'doc'
self.metadata = metadata if metadata else {}
# if self.id and 'id' not in self.metadata:
# self.metadata['id'] = self.id
# if self.metadata and 'type' not in self.metadata:
# self.metadata['type'] = self.type
self.reading_order: Dict[int, str] = reading_order if reading_order else {}
self.reading_order_number = {}
self.parent: Union[StructureDoc, None] = None
self.logical_parent: Union[StructureDoc, None] = None
[docs]
def set_parent(self, parent: StructureDoc):
"""Set parent document and add metadata of parent to this document's metadata"""
self.parent = parent
self.add_parent_id_to_metadata()
[docs]
def add_type(self, doc_type: Union[str, List[str]]) -> None:
doc_types = [doc_type] if isinstance(doc_type, str) else doc_type
if isinstance(self.type, str):
self.type = [self.type]
for doc_type in doc_types:
if doc_type not in self.type:
self.type.append(doc_type)
[docs]
def remove_type(self, doc_type: Union[str, List[str]]) -> None:
doc_types = [doc_type] if isinstance(doc_type, str) else doc_type
if isinstance(self.type, str):
self.type = [self.type]
for doc_type in doc_types:
if doc_type in self.type:
self.type.remove(doc_type)
if len(self.type) == 1:
self.type = self.type[0]
[docs]
def has_type(self, doc_type: str) -> bool:
if isinstance(self.type, str):
return doc_type == self.type
else:
return doc_type in self.type
@property
def types(self) -> Set[str]:
if isinstance(self.type, str):
return {self.type}
else:
return set(self.type)
[docs]
def set_as_parent(self, children: List[StructureDoc]):
"""Set this document as parent of a list of child documents"""
for child in children:
child.set_parent(self)
@property
def json(self) -> Dict[str, any]:
json_data = {
'id': self.id,
'type': self.type,
'metadata': self.metadata
}
if self.reading_order:
json_data['reading_order'] = self.reading_order
return json_data
[docs]
class PhysicalStructureDoc(StructureDoc):
def __init__(self, doc_id: str = None,
doc_type: Union[str, List[str]] = None,
metadata: Dict[str, any] = None,
coords: Coords = None,
reading_order: Dict[int, str] = None):
super().__init__(doc_id=doc_id, doc_type=doc_type, metadata=metadata, reading_order=reading_order)
self.coords: Union[None, Coords] = coords
self.main_type = 'physical_structure_doc'
@property
def json(self) -> Dict[str, any]:
doc_json = super().json
if self.coords:
doc_json['coords'] = self.coords.points
return doc_json
[docs]
def set_derived_id(self, parent_id: str):
box_string = f"{self.coords.x}-{self.coords.y}-{self.coords.w}-{self.coords.h}"
self.id = f"{parent_id}-{self.main_type}-{box_string}"
# self.metadata['id'] = self.id
[docs]
class LogicalStructureDoc(StructureDoc):
def __init__(self, doc_id: str = None, doc_type: Union[str, List[str]] = None,
metadata: Dict[str, any] = None, lines: List[PageXMLTextLine] = None,
text_regions: List[PageXMLTextRegion] = None, reading_order: Dict[int, str] = None):
super().__init__(doc_id, doc_type, metadata, reading_order=reading_order)
self.lines: List[PageXMLTextLine] = lines if lines else []
self.text_regions: List[PageXMLTextRegion] = text_regions if text_regions else []
self.logical_parent: Union[StructureDoc, None] = None
[docs]
def set_logical_parent(self, parent: StructureDoc):
"""Set parent document and add metadata of parent to this document's metadata"""
self.logical_parent = parent
self.add_logical_parent_id_to_metadata()
[docs]
def set_as_logical_parent(self, children: Union[StructureDoc, List[StructureDoc]]):
if isinstance(children, StructureDoc):
children = [children]
for child in children:
child.parent = self
[docs]
class PageXMLDoc(PhysicalStructureDoc):
def __init__(self, doc_id: str = None, doc_type: Union[str, List[str]] = None,
metadata: Dict[str, any] = None, coords: Coords = None, reading_order: Dict[int, str] = None):
super().__init__(doc_id=doc_id, doc_type="pagexml_doc", metadata=metadata, reading_order=reading_order)
self.coords: Union[None, Coords] = coords
self.add_type(doc_type)
self.main_type = 'pagexml_doc'
@property
def stats(self):
return {}
[docs]
class PageXMLWord(PageXMLDoc):
def __init__(self, doc_id: str = None, doc_type: Union[str, List[str]] = None,
metadata: Dict[str, any] = None, coords: Coords = None,
conf: float = None, text: str = None):
super().__init__(doc_id, "word", metadata, coords)
self.conf = conf
self.text = text
self.main_type = 'word'
if doc_type:
self.add_type(doc_type)
def __repr__(self):
content_string = f"id={self.id}, type={self.type}, text={self.text}"
if self.conf is not None:
content_string += f", conf={self.conf}"
return f"{self.__class__.__name__}({content_string})"
@property
def json(self) -> Dict[str, any]:
doc_json = super().json
doc_json['text'] = self.text
if self.conf:
doc_json['conf'] = self.conf
return doc_json
[docs]
class PageXMLTextLine(PageXMLDoc):
def __init__(self, doc_id: str = None, doc_type: Union[str, List[str]] = None,
metadata: Dict[str, any] = None, coords: Coords = None,
baseline: Baseline = None, xheight: int = None,
conf: float = None, text: str = None, words: List[PageXMLWord] = None,
reading_order: Dict[int, str] = None):
super().__init__(doc_id=doc_id, doc_type="line", metadata=metadata,
coords=coords, reading_order=reading_order)
self.main_type = 'line'
self.conf = conf
self.text: Union[None, str] = text
self.xheight: Union[None, int] = xheight
self.baseline: Union[None, Baseline] = baseline
self.words: List[PageXMLWord] = words if words else []
self.metadata['type'] = 'line'
self.set_as_parent(self.words)
if doc_type:
self.add_type(doc_type)
def __repr__(self):
content_string = f"id={self.id}, type={self.type}, text=\"{self.text}\" conf={self.conf}"
return f"{self.__class__.__name__}({content_string})"
def __lt__(self, other: PageXMLTextLine):
"""For sorting text lines. Assumptions: reading from left to right,
top to bottom. If two lines are horizontally overlapping, sort from
top to bottom, even if the upper lines is more horizontally indented."""
if other == self:
return False
return sort_lines(self, other, as_column=True)
@property
def json(self) -> Dict[str, any]:
doc_json = super().json
doc_json['text'] = self.text
if self.conf is not None:
doc_json['conf'] = self.conf
if self.baseline:
doc_json['baseline'] = self.baseline.points
if self.words:
doc_json['words'] = [word.json for word in self.words]
if self.xheight:
doc_json['xheight'] = self.xheight
return doc_json
@property
def stats(self):
return {
'words': self.num_words
}
[docs]
def get_words(self):
if self.words:
return self.words
elif self.text:
return self.text.split(' ')
else:
return []
@property
def num_words(self):
return len(self.get_words())
[docs]
def is_below(self, other: PageXMLTextLine) -> bool:
"""Test if the baseline of this line is directly below the baseline of the other line."""
# if there is no horizontal overlap, this line is not directly below the other
if not get_horizontal_overlap(self, other):
# print("NO HORIZONTAL OVERLAP")
return False
# if the bottom of this line is above the top of the other line, this line is above the other
if self.baseline.bottom < other.baseline.top:
# print("BOTTOM IS ABOVE TOP")
return False
# if most of this line's baseline points are not below most the other's baseline points
# this line is not below the other
if baseline_is_below(self.baseline, other.baseline):
# print("BASELINE IS BELOW")
return True
return False
[docs]
def is_next_to(self, other: PageXMLTextLine) -> bool:
"""Test if this line is vertically aligned with the other line."""
if get_vertical_overlap(self, other) == 0:
# print("NO VERTICAL OVERLAP")
return False
if get_horizontal_overlap(self, other) > 40:
# print("TOO MUCH HORIZONTAL OVERLAP", horizontal_overlap(self.coords, other.coords))
return False
if self.baseline.top > other.baseline.bottom + 10:
# print("VERTICAL BASELINE GAP TOO BIG")
return False
elif self.baseline.bottom < other.baseline.top - 10:
return False
else:
return True
[docs]
class PageXMLTextRegion(PageXMLDoc):
def __init__(self, doc_id: str = None, doc_type: Union[str, List[str]] = None,
metadata: Dict[str, any] = None, coords: Coords = None,
text_regions: List[PageXMLTextRegion] = None,
lines: List[PageXMLTextLine] = None, text: str = None,
orientation: float = None, reading_order: Dict[int, str] = None):
super().__init__(doc_id=doc_id, doc_type="text_region", metadata=metadata,
coords=coords, reading_order=reading_order)
self.main_type = 'text_region'
self.text_regions: List[PageXMLTextRegion] = text_regions if text_regions is not None else []
self.lines: List[PageXMLTextLine] = lines if lines is not None else []
self.orientation: Union[None, float] = orientation
self.reading_order_number = {}
self.text = text
if self.lines is not None:
self.set_as_parent(self.lines)
if self.lines is not None:
self.set_as_parent(self.lines)
if self.text_regions is not None:
self.set_as_parent(self.text_regions)
if self.reading_order:
self.set_text_regions_in_reader_order()
if doc_type:
self.add_type(doc_type)
def __repr__(self):
stats = json.dumps(self.stats)
content_string = f"\n\tid={self.id}, \n\ttype={self.type}, \n\tstats={stats}"
return f"{self.__class__.__name__}({content_string}\n)"
def __lt__(self, other: PageXMLTextRegion):
"""For sorting text regions. Assumptions: reading from left to right,
top to bottom. If two regions are horizontally overlapping, sort from
top to bottom, even if the upper region is more horizontally indented."""
if other == self:
return False
if is_horizontally_overlapping(self, other):
return self.coords.top < other.coords.top
else:
return self.coords.left < other.coords.left
[docs]
def add_child(self, child: PageXMLDoc):
child.set_parent(self)
if isinstance(child, PageXMLTextLine):
self.lines.append(child)
elif isinstance(child, PageXMLTextRegion):
self.text_regions.append(child)
else:
raise TypeError(f'unknown child type: {child.__class__.__name__}')
self.coords = parse_derived_coords(self.text_regions + self.lines)
@property
def json(self) -> Dict[str, any]:
doc_json = super().json
if self.text:
doc_json['text'] = self.text
if self.lines:
doc_json['lines'] = [line.json for line in self.lines]
if self.text_regions:
doc_json['text_regions'] = [text_region.json for text_region in self.text_regions]
if self.orientation:
doc_json['orientation'] = self.orientation
doc_json['stats'] = self.stats
return doc_json
[docs]
def get_text_regions_in_reading_order(self):
if not self.reading_order:
return self.text_regions
tr_ids = list({region_id: None for _index, region_id in sorted(self.reading_order.items(), key=lambda x: x[0])})
tr_map = {}
for text_region in self.text_regions:
# if text_region.id not in tr_ids:
# print("reading order:", self.reading_order)
# raise KeyError(f"text_region with id {text_region.id} is not listed in reading_order")
tr_map[text_region.id] = text_region
return [tr_map[tr_id] for tr_id in tr_ids if tr_id in tr_map]
[docs]
def set_text_regions_in_reader_order(self):
tr_ids = [tr.id for tr in self.text_regions]
for order_number in self.reading_order:
text_region_id = self.reading_order[order_number]
self.reading_order_number[text_region_id] = order_number
for tr_id in tr_ids:
if tr_id not in self.reading_order_number:
# there is a text_region that was not in the original PageXML output:
# ignore reading order
self.reading_order = None
return None
self.text_regions = self.get_text_regions_in_reading_order()
[docs]
def get_all_text_regions(self):
text_regions: Set[PageXMLTextRegion] = set()
for text_region in self.text_regions:
text_regions.add(text_region)
if text_region.text_regions:
text_regions += text_region.get_all_text_regions()
return text_regions
[docs]
def get_inner_text_regions(self) -> List[PageXMLTextRegion]:
text_regions: List[PageXMLTextRegion] = []
for text_region in self.text_regions:
if text_region.text_regions:
text_regions += text_region.get_inner_text_regions()
elif text_region.lines:
text_regions.append(text_region)
if not self.text_regions and self.lines:
text_regions.append(self)
return text_regions
[docs]
def get_lines(self) -> List[PageXMLTextLine]:
lines: List[PageXMLTextLine] = []
if self.text_regions:
if self.reading_order and all([tr.id in self.reading_order for tr in self.text_regions]):
for tr in sorted(self.text_regions, key=lambda t: self.reading_order_number[t.id]):
lines += tr.get_lines()
else:
for text_region in sorted(self.text_regions):
lines += text_region.get_lines()
if self.lines:
lines += self.lines
return lines
[docs]
def get_words(self) -> Union[List[str], List[PageXMLWord]]:
words = []
if self.text is not None:
return self.text.split(' ')
if self.lines:
for line in self.lines:
if line.words:
words += line.words
elif line.text:
words += line.text.split(' ')
if self.text_regions:
for tr in self.text_regions:
words += tr.get_words()
return words
@property
def num_lines(self):
return len(self.get_lines())
@property
def num_words(self):
return len(self.get_words())
@property
def num_text_regions(self):
return len(self.text_regions)
@property
def stats(self):
return {
'lines': self.num_lines,
'words': self.num_words,
'text_regions': self.num_text_regions
}
[docs]
class PageXMLColumn(PageXMLTextRegion):
def __init__(self, doc_id: str = None, doc_type: Union[str, List[str]] = None,
metadata: Dict[str, any] = None, coords: Coords = None,
text_regions: List[PageXMLTextRegion] = None, lines: List[PageXMLTextLine] = None,
reading_order: Dict[int, str] = None):
super().__init__(doc_id=doc_id, doc_type="column", metadata=metadata, coords=coords, lines=lines,
text_regions=text_regions, reading_order=reading_order)
self.main_type = 'column'
if doc_type:
self.add_type(doc_type)
@property
def json(self) -> Dict[str, any]:
doc_json = super().json
doc_json['stats'] = self.stats
return doc_json
@property
def stats(self):
stats = super().stats
return stats
[docs]
class PageXMLPage(PageXMLTextRegion):
def __init__(self, doc_id: str = None, doc_type: Union[str, List[str]] = None,
metadata: Dict[str, any] = None, coords: Coords = None,
columns: List[PageXMLColumn] = None, text_regions: List[PageXMLTextRegion] = None,
extra: List[PageXMLTextRegion] = None, lines: List[PageXMLTextLine] = None,
reading_order: Dict[int, str] = None):
super().__init__(doc_id=doc_id, doc_type="page", metadata=metadata, coords=coords, lines=lines,
text_regions=text_regions, reading_order=reading_order)
self.main_type = 'page'
self.columns: List[PageXMLColumn] = columns if columns else []
self.extra: List[PageXMLTextRegion] = extra if extra else []
self.set_as_parent(self.columns)
self.set_as_parent(self.extra)
if doc_type:
self.add_type(doc_type)
[docs]
def get_lines(self):
lines = []
if self.columns:
# First, add lines from columns
for column in sorted(self.columns):
lines += column.get_lines()
# Second, add lines from text_regions
if self.extra:
for tr in self.extra:
lines += tr.get_lines()
if self.text_regions:
# print('get_lines - reading_order_number:', self.reading_order_number)
# print('get_lines - reading_order:', self.reading_order)
if self.reading_order and all([tr.id in self.reading_order for tr in self.text_regions]):
for tr in sorted(self.text_regions, key=lambda t: self.reading_order_number[t]):
lines += tr.get_lines()
else:
for tr in sorted(self.text_regions):
lines += tr.get_lines()
if self.lines:
raise AttributeError(f'page {self.id} has lines as direct property')
return lines
[docs]
def add_child(self, child: PageXMLDoc, as_extra: bool = False):
# print('as_extra:', as_extra)
# print('stats before adding:', self.stats)
child.set_parent(self)
if as_extra and (isinstance(child, PageXMLColumn) or isinstance(child, PageXMLTextRegion)):
self.extra.append(child)
elif isinstance(child, PageXMLColumn) or child.__class__.__name__ == 'PageXMLColumn':
self.columns.append(child)
elif isinstance(child, PageXMLTextLine):
self.lines.append(child)
elif isinstance(child, PageXMLTextRegion):
self.text_regions.append(child)
else:
raise TypeError(f'unknown child type: {child.__class__.__name__}')
self.coords = parse_derived_coords(self.extra + self.columns + self.text_regions + self.lines)
# print('stats after adding:', self.stats)
[docs]
def get_all_text_regions(self):
text_regions = [tr for col in self.columns for tr in col.text_regions]
text_regions.extend([tr for tr in self.extra])
return text_regions
[docs]
def get_text_regions_in_reading_order(self, include_extra: bool = True):
text_regions = []
if len(self.text_regions) > 0:
text_regions.extend(self.text_regions)
if hasattr(self, 'columns'):
for col in sorted(self.columns):
text_regions.extend(col.get_text_regions_in_reading_order())
if include_extra and hasattr(self, 'extra'):
text_regions.extend(sorted(self.extra))
return text_regions
[docs]
def get_inner_text_regions(self) -> List[PageXMLTextRegion]:
text_regions = self.get_all_text_regions()
inner_trs = []
for tr in text_regions:
inner_trs.extend(tr.get_inner_text_regions())
return inner_trs
@property
def json(self) -> Dict[str, any]:
doc_json = super().json
# if self.lines:
# doc_json['lines'] = [line.json for line in self.lines]
# if self.text_regions:
# doc_json['text_regions'] = [text_region.json for text_region in self.text_regions]
if self.columns:
doc_json['columns'] = [column.json for column in self.columns]
if self.extra:
doc_json['extra'] = [text_region.json for text_region in self.extra]
doc_json['stats'] = self.stats
return doc_json
@property
def stats(self):
"""Pages diverge from other types since they have columns and extra
text regions, or plain text regions, so have their own way of calculating
stats."""
lines = self.get_lines()
stats = {
"words": sum([len(line.get_words()) for line in lines]),
"lines": len(lines)
}
if self.columns:
stats['columns'] = len(self.columns)
if self.extra:
stats['extra'] = len(self.extra)
if self.text_regions:
stats['text_regions'] = len(self.text_regions)
return stats
[docs]
class PageXMLScan(PageXMLTextRegion):
def __init__(self, doc_id: str = None, doc_type: Union[str, List[str]] = None,
metadata: Dict[str, any] = None, coords: Coords = None,
pages: List[PageXMLPage] = None, columns: List[PageXMLColumn] = None,
text_regions: List[PageXMLTextRegion] = None, lines: List[PageXMLTextLine] = None,
reading_order: Dict[int, str] = None):
super().__init__(doc_id=doc_id, doc_type="scan", metadata=metadata, coords=coords, lines=lines,
text_regions=text_regions, reading_order=reading_order)
self.main_type = 'scan'
self.pages: List[PageXMLPage] = pages if pages else []
self.columns: List[PageXMLColumn] = columns if columns else []
self.set_as_parent(self.pages)
self.set_as_parent(self.columns)
if doc_type:
self.add_type(doc_type)
self.set_scan_id_as_metadata()
[docs]
def add_child(self, child: PageXMLDoc):
child.set_parent(self)
if isinstance(child, PageXMLPage):
self.pages.append(child)
elif isinstance(child, PageXMLColumn):
self.columns.append(child)
elif isinstance(child, PageXMLTextRegion):
self.text_regions.append(child)
elif isinstance(child, PageXMLTextLine):
self.lines.append(child)
[docs]
def set_scan_id_as_metadata(self):
self.metadata['scan_id'] = self.id
for tr in self.get_all_text_regions():
tr.metadata['scan_id'] = self.id
for line in self.get_lines():
line.metadata['scan_id'] = self.id
for word in self.get_words():
if isinstance(word, PageXMLWord):
word.metadata['scan_id'] = self.id
@property
def json(self) -> Dict[str, any]:
doc_json = super().json
# if self.lines:
# doc_json['lines'] = [line.json for line in self.lines]
# if self.text_regions:
# doc_json['text_regions'] = [text_region.json for text_region in self.text_regions]
if self.columns:
doc_json['columns'] = [line.json for line in self.columns]
if self.pages:
doc_json['pages'] = [line.json for line in self.pages]
doc_json['stats'] = self.stats
return doc_json
@property
def stats(self):
stats = super().stats
stats['columns'] = len([column for page in self.pages for column in page.columns])
stats['extra'] = len([text_region for page in self.pages for text_region in page.extra])
stats['pages'] = len(self.pages)
return stats
[docs]
def set_parentage(parent_doc: StructureDoc):
if hasattr(parent_doc, 'pages') and parent_doc.pages:
parent_doc.set_as_parent(parent_doc.pages)
for page in parent_doc.pages:
set_parentage(page)
if hasattr(parent_doc, 'columns') and parent_doc.columns:
parent_doc.set_as_parent(parent_doc.columns)
for column in parent_doc.columns:
set_parentage(column)
if hasattr(parent_doc, 'text_regions') and parent_doc.text_regions:
parent_doc.set_as_parent(parent_doc.text_regions)
for text_region in parent_doc.text_regions:
set_parentage(text_region)
if hasattr(parent_doc, 'lines') and parent_doc.lines:
parent_doc.set_as_parent(parent_doc.lines)
for line in parent_doc.lines:
set_parentage(line)
if hasattr(parent_doc, 'words') and parent_doc.words:
parent_doc.set_as_parent(parent_doc.words)
for word in parent_doc.words:
set_parentage(word)
[docs]
def in_same_column(element1: PageXMLDoc, element2: PageXMLDoc) -> bool:
"""Check if two PageXML elements are part of the same column."""
if (
'scan_id' in element1.metadata
and 'scan_id' in element2.metadata
and element1.metadata['scan_id'] != element2.metadata['scan_id']
):
return False
if 'column_id' in element1.metadata and 'column_id' in element2.metadata:
return element1.metadata['column_id'] == element2.metadata['column_id']
else:
# check if the two lines have a horizontal overlap that is more than 50% of the width of line 1
# Note: this doesn't work for short adjacent lines within the same column
return get_horizontal_overlap(element1, element2) > (element1.coords.w / 2)