API
Subpackages
- pagexml.analysis package
- Submodules
- pagexml.analysis.layout_stats module
average_baseline_height(), categorise_line_width(), compute_baseline_distances(), compute_bounding_box_distances(), compute_columns_stats(), compute_height_stats(), compute_lines_stats(), compute_pages_stats(), compute_pagexml_stats(), compute_points_distances(), compute_scans_stats(), compute_textregion_distance(), compute_textregions_stats(), find_line_width_boundary_points(), find_lowest_point(), get_baseline_y(), get_bottom_points(), get_boundary_width_ranges(), get_line_distances(), get_line_height_stats(), get_line_width_stats(), get_line_widths(), get_text_heights(), get_textregion_avg_char_width(), get_textregion_avg_line_distance(), get_textregion_avg_line_width(), get_textregion_line_distances(), interpolate_baseline_points(), interpolate_points(), line_starts_with_big_capital(), sort_coords_above_below_baseline()
- pagexml.analysis.stats module
- pagexml.analysis.text_stats module
LineAnalyser, LineCharAnalyser, LineWordAnalyser, WordBreakDetector, compute_complement_keyness(), compute_expected(), compute_keyness(), compute_log_likelihood(), determine_word_break(), determine_word_break_typical_merge_end(), end_is_common_word(), end_start_are_bigram(), end_start_are_hyphenated_compound(), get_doc_words(), get_keyness_vocab(), get_line_text(), get_observed(), get_typical_start_end_words(), get_word_cat_stats(), get_words_per_line(), has_common_merge_end(), has_non_merge_word(), has_word_break_symbol(), is_non_mid_word(), make_line_analyser(), merge_analysers(), merge_is_more_common(), show_word_break_context(), start_is_titleword(), start_word_has_incorrect_titlecase()
- pagexml.analysis.layout_stats module
- Submodules
- pagexml.helper package
- Submodules
- pagexml.helper.file_helper module
- pagexml.helper.pagexml_helper module
LineIterable, combine_adjacent_lines(), elements_overlap(), get_custom_tags(), horizontal_group_lines(), horizontally_merge_lines(), line_ends_with_word_break(), make_line_range(), make_line_text(), make_text_region_text(), merge_lines(), merge_sets(), merge_textregions(), pagexml_to_line_format(), pretty_print_textregion(), print_textregion_stats(), read_line_format_file(), sort_lines_in_column_reading_order(), sort_lines_in_reading_direction(), sort_lines_in_reading_order(), sort_lines_in_row_reading_order(), sort_regions_in_reading_order(), write_pagexml_to_line_format()
- pagexml.helper.text_helper module
LineReader, find_term_in_context(), get_bbox(), get_line_format_json(), get_line_format_tsv(), get_line_words(), get_page_lines_words(), make_line_format_file(), make_list(), make_skipgram_similarity_dict(), read_lines_from_line_files(), read_pagexml_docs_from_line_file(), remove_hyphen(), remove_word_break_chars(), split_line_words(), transform_box_to_coords()
- Submodules
- pagexml.model package
- Submodules
- pagexml.model.physical_document_model module
Baseline, Coords, LogicalStructureDoc, PageXMLColumn, PageXMLDoc, PageXMLPage, PageXMLScan, PageXMLTextLine, PageXMLTextRegion, PageXMLWord, PhysicalStructureDoc, StructureDoc, baseline_is_below(), coords_list_to_hull_coords(), edges_to_hull_points(), find_baseline_overlap_start_indexes(), get_horizontal_diff(), get_horizontal_diff_ratio(), get_horizontal_overlap(), get_horizontal_overlap_ratio(), get_vertical_diff(), get_vertical_diff_ratio(), get_vertical_overlap(), get_vertical_overlap_ratio(), has_baseline(), horizontal_distance(), in_same_column(), is_below(), is_horizontally_overlapping(), is_next_to(), is_vertically_overlapping(), parse_derived_coords(), parse_points(), points_to_hull_edges(), set_parentage(), sort_lines(), vertical_distance()
- pagexml.model.physical_document_model module
- Submodules
- pagexml.plotting package
Submodules
- pagexml.column_parser module
column_bounding_box_surrounds_lines(), compute_pixel_dist(), determine_column_type(), determine_freq_gap_interval(), find_column_gaps(), find_overlapping_columns(), handle_extra_lines(), is_full_text_column(), is_header_footer_column(), is_noise_column(), is_text_column(), make_column_range_columns(), make_derived_column(), merge_columns(), merge_overlapping_columns(), new_gap_pixel_interval(), sort_lines_in_column_ranges(), split_lines_on_column_gaps(), within_column()
- pagexml.pagexml_tools module
- pagexml.parser module
json_to_column_container(), json_to_pagexml_column(), json_to_pagexml_doc(), json_to_pagexml_line(), json_to_pagexml_page(), json_to_pagexml_scan(), json_to_pagexml_text_region(), json_to_pagexml_word(), parse_baseline(), parse_conf(), parse_coords(), parse_custom_metadata(), parse_custom_metadata_element(), parse_custom_metadata_element_list(), parse_line_words(), parse_page_image_size(), parse_page_metadata(), parse_page_reading_order(), parse_pagexml_file(), parse_pagexml_files(), parse_pagexml_files_from_archive(), parse_pagexml_files_from_directory(), parse_pagexml_from_json(), parse_pagexml_json(), parse_text_equiv(), parse_textline(), parse_textline_list(), parse_textregion(), parse_textregion_list(), read_pagexml_dirs(), read_pagexml_file()