pagexml.model.physical_document_model module

class pagexml.model.physical_document_model.Baseline(points: str | List[Tuple[int, int]])[source]

Bases: Coords

class pagexml.model.physical_document_model.Coords(points: str | List[Tuple[int, int]])[source]

Bases: object

property bottom
property box
property height
property json
property left
property right
property top
property width
class pagexml.model.physical_document_model.LogicalStructureDoc(doc_id: str = None, doc_type: str | List[str] = None, metadata: Dict[str, any] = None, lines: List[PageXMLTextLine] = None, text_regions: List[PageXMLTextRegion] = None, reading_order: Dict[int, str] = None)[source]

Bases: StructureDoc

add_logical_parent_id_to_metadata()[source]
set_as_logical_parent(children: StructureDoc | List[StructureDoc])[source]
set_logical_parent(parent: StructureDoc)[source]

Set the parent document and add the parent’s metadata to this document’s metadata.

class pagexml.model.physical_document_model.PageXMLColumn(doc_id: str = None, doc_type: str | List[str] = None, metadata: Dict[str, any] = None, coords: Coords = None, text_regions: List[PageXMLTextRegion] = None, lines: List[PageXMLTextLine] = None, reading_order: Dict[int, str] = None)[source]

Bases: PageXMLTextRegion

property json: Dict[str, any]
property stats
class pagexml.model.physical_document_model.PageXMLDoc(doc_id: str = None, doc_type: str | List[str] = None, metadata: Dict[str, any] = None, coords: Coords = None, reading_order: Dict[int, str] = None)[source]

Bases: PhysicalStructureDoc

property stats
class pagexml.model.physical_document_model.PageXMLPage(doc_id: str = None, doc_type: str | List[str] = None, metadata: Dict[str, any] = None, coords: Coords = None, columns: List[PageXMLColumn] = None, text_regions: List[PageXMLTextRegion] = None, extra: List[PageXMLTextRegion] = None, lines: List[PageXMLTextLine] = None, reading_order: Dict[int, str] = None)[source]

Bases: PageXMLTextRegion

add_child(child: PageXMLDoc, as_extra: bool = False)[source]
get_all_text_regions()[source]
get_inner_text_regions() → List[PageXMLTextRegion][source]
get_lines()[source]
get_text_regions_in_reading_order(include_extra: bool = True)[source]
property json: Dict[str, any]
property stats

Pages differ from other document types: they can have columns and extra text regions, or plain text regions, so they have their own way of calculating stats.

class pagexml.model.physical_document_model.PageXMLScan(doc_id: str = None, doc_type: str | List[str] = None, metadata: Dict[str, any] = None, coords: Coords = None, pages: List[PageXMLPage] = None, columns: List[PageXMLColumn] = None, text_regions: List[PageXMLTextRegion] = None, lines: List[PageXMLTextLine] = None, reading_order: Dict[int, str] = None)[source]

Bases: PageXMLTextRegion

add_child(child: PageXMLDoc)[source]
property json: Dict[str, any]
set_scan_id_as_metadata()[source]
property stats
class pagexml.model.physical_document_model.PageXMLTextLine(doc_id: str = None, doc_type: str | List[str] = None, metadata: Dict[str, any] = None, coords: Coords = None, baseline: Baseline = None, xheight: int = None, conf: float = None, text: str = None, words: List[PageXMLWord] = None, reading_order: Dict[int, str] = None)[source]

Bases: PageXMLDoc

get_words()[source]
is_below(other: PageXMLTextLine) → bool[source]

Test if the baseline of this line is directly below the baseline of the other line.

is_next_to(other: PageXMLTextLine) → bool[source]

Test if this line is vertically aligned with the other line.

property json: Dict[str, any]
property num_words
property stats
class pagexml.model.physical_document_model.PageXMLTextRegion(doc_id: str = None, doc_type: str | List[str] = None, metadata: Dict[str, any] = None, coords: Coords = None, text_regions: List[PageXMLTextRegion] = None, lines: List[PageXMLTextLine] = None, text: str = None, orientation: float = None, reading_order: Dict[int, str] = None)[source]

Bases: PageXMLDoc

add_child(child: PageXMLDoc)[source]
get_all_text_regions()[source]
get_inner_text_regions() → List[PageXMLTextRegion][source]
get_lines() → List[PageXMLTextLine][source]
get_text_regions_in_reading_order()[source]
get_words() → List[str] | List[PageXMLWord][source]
property json: Dict[str, any]
property num_lines
property num_text_regions
property num_words
set_text_regions_in_reader_order()[source]
property stats
class pagexml.model.physical_document_model.PageXMLWord(doc_id: str = None, doc_type: str | List[str] = None, metadata: Dict[str, any] = None, coords: Coords = None, conf: float = None, text: str = None)[source]

Bases: PageXMLDoc

property json: Dict[str, any]
class pagexml.model.physical_document_model.PhysicalStructureDoc(doc_id: str = None, doc_type: str | List[str] = None, metadata: Dict[str, any] = None, coords: Coords = None, reading_order: Dict[int, str] = None)[source]

Bases: StructureDoc

property json: Dict[str, any]
set_derived_id(parent_id: str)[source]
class pagexml.model.physical_document_model.StructureDoc(doc_id: None | str = None, doc_type: None | str | List[str] = None, metadata: Dict[str, any] = None, reading_order: Dict[int, str] = None)[source]

Bases: object

add_parent_id_to_metadata()[source]
add_type(doc_type: str | List[str]) → None[source]
has_type(doc_type: str) → bool[source]
property json: Dict[str, any]
remove_type(doc_type: str | List[str]) → None[source]
set_as_parent(children: List[StructureDoc])[source]

Set this document as parent of a list of child documents

set_parent(parent: StructureDoc)[source]

Set the parent document and add the parent’s metadata to this document’s metadata.

property types: Set[str]
pagexml.model.physical_document_model.baseline_is_below(baseline1: Baseline, baseline2: Baseline) → bool[source]

Test if baseline 1 is directly below baseline 2

pagexml.model.physical_document_model.coords_list_to_hull_coords(coords_list)[source]
pagexml.model.physical_document_model.edges_to_hull_points(edges)[source]
pagexml.model.physical_document_model.find_baseline_overlap_start_indexes(baseline1: Baseline, baseline2: Baseline) → Tuple[int, int][source]

Find the first point in each baseline where the two start to horizontally overlap.

pagexml.model.physical_document_model.get_horizontal_diff(doc1: PageXMLDoc, doc2: PageXMLDoc) → int[source]
pagexml.model.physical_document_model.get_horizontal_diff_ratio(doc1: PageXMLDoc, doc2: PageXMLDoc) → float[source]
pagexml.model.physical_document_model.get_horizontal_overlap(doc1: PageXMLDoc, doc2: PageXMLDoc) → int[source]
pagexml.model.physical_document_model.get_horizontal_overlap_ratio(doc1: PageXMLDoc, doc2: PageXMLDoc) → float[source]
pagexml.model.physical_document_model.get_vertical_diff(doc1: PageXMLDoc, doc2: PageXMLDoc) → int[source]
pagexml.model.physical_document_model.get_vertical_diff_ratio(doc1: PageXMLDoc, doc2: PageXMLDoc) → float[source]
pagexml.model.physical_document_model.get_vertical_overlap(doc1: PageXMLDoc, doc2: PageXMLDoc) → int[source]
pagexml.model.physical_document_model.get_vertical_overlap_ratio(doc1: PageXMLDoc, doc2: PageXMLDoc) → float[source]
pagexml.model.physical_document_model.has_baseline(doc: PageXMLDoc) → bool[source]
pagexml.model.physical_document_model.horizontal_distance(doc1: PageXMLDoc, doc2: PageXMLDoc)[source]
pagexml.model.physical_document_model.in_same_column(element1: PageXMLDoc, element2: PageXMLDoc) → bool[source]

Check if two PageXML elements are part of the same column.

pagexml.model.physical_document_model.is_below(region1: PageXMLTextRegion, region2: PageXMLTextRegion, margin: int = 20) → bool[source]
pagexml.model.physical_document_model.is_horizontally_overlapping(region1: PageXMLDoc, region2: PageXMLDoc, threshold: float = 0.5) → bool[source]
pagexml.model.physical_document_model.is_next_to(region1: PageXMLTextRegion, region2: PageXMLTextRegion, margin: int = 20) → bool[source]
pagexml.model.physical_document_model.is_vertically_overlapping(region1: PageXMLDoc, region2: PageXMLDoc, threshold: float = 0.5) → bool[source]
pagexml.model.physical_document_model.parse_derived_coords(document_list: list) → Coords[source]

Derive scan coordinates for a composite document based on the list of documents it contains. A convex hull is drawn around all points of all contained documents.

pagexml.model.physical_document_model.parse_points(points: str | List[Tuple[int, int]]) → List[Tuple[int, int]][source]

Parse a string of PageXML image coordinates into a list of coordinates.

pagexml.model.physical_document_model.points_to_hull_edges(points)[source]
pagexml.model.physical_document_model.set_parentage(parent_doc: StructureDoc)[source]
pagexml.model.physical_document_model.sort_lines(line1: PageXMLTextLine, line2: PageXMLTextLine, as_column: bool = True)[source]
pagexml.model.physical_document_model.vertical_distance(doc1: PageXMLDoc, doc2: PageXMLDoc)[source]