Extracting tables
The table information in the API response is stored in two parts:
- table cell values (`content`)
- table structure (`annotations`)
Below is a code example that takes the output value from the API response (what we call `serialized_document` in the example) and converts it into multiple grids, where a grid is a list of rows and each row is a list of cell values (see `build_table_grids`).
from collections import defaultdict
from typing import Any, Dict, List, Sequence, Set, Tuple

# Keys and type tags looked up in the serialized document.
DATA = "data"
SPAN = "span"
INDEX = "index"
TYPE = "type"
CHILDREN = "children"
TABLE_CELL = "TABLE_CELL"
TABLE = "TABLE"
CONTENT_UIDS = "content_uids"
TABLE_STRUCTURE = "table_structure"
CONTENT_TREE = "content_tree"
CONTENT = "content"
ANNOTATIONS = "annotations"
UID = "uid"


def _get_table_shape(table_structure_annotations: Sequence[Dict[str, Any]]) -> Tuple[int, int]:
    """Get table shape from table structure annotations.

    Args:
        table_structure_annotations: annotations of type ``table_structure``; each one
            carries a (row, column) ``index`` and a (row, column) ``span`` under ``data``.

    Returns:
        ``(n_rows, n_cols)``: the largest ``index + span`` along each axis, or ``(0, 0)``
        when there are no annotations.

    Raises:
        ValueError: if any annotation is not a table structure annotation.
    """
    if any(annotation[TYPE] != TABLE_STRUCTURE for annotation in table_structure_annotations):
        raise ValueError("Table grid can only be built from table structure annotations.")
    # default=0 keeps an empty table from failing with "max() arg is an empty sequence".
    n_rows = max(
        (
            annotation[DATA][INDEX][0] + annotation[DATA][SPAN][0]
            for annotation in table_structure_annotations
        ),
        default=0,
    )
    n_cols = max(
        (
            annotation[DATA][INDEX][1] + annotation[DATA][SPAN][1]
            for annotation in table_structure_annotations
        ),
        default=0,
    )
    return n_rows, n_cols


def _check_complete_set(integer_set: Set[int]) -> bool:
    """Check that the set of integers contains all integers between 0 and its max.

    An empty set is considered complete. A set containing a negative integer is never
    complete.
    """
    # A set of n distinct integers is exactly {0, ..., n - 1} iff it equals range(n).
    return integer_set == set(range(len(integer_set)))


def _validate_annotations(duplicated_annotations: List[Dict[str, Any]]) -> None:
    """Validate duplicated annotations.

    Args:
        duplicated_annotations: annotations that are expected to all have span (1, 1).

    Raises:
        ValueError: if a span differs from (1, 1), if two annotations share an index,
            or if the row / column indices do not cover every value from 0 to their max.
    """
    # Spans and indices are normalised to tuples so that list values compare and hash
    # the same way as tuple values.
    # check all spans are 1 (annotations are duplicated)
    all_spans = [tuple(annotation[DATA][SPAN]) for annotation in duplicated_annotations]
    if any(span != (1, 1) for span in all_spans):
        raise ValueError("Un-duplicated merged cells in table.")

    # check no overlap
    all_indices = [tuple(annotation[DATA][INDEX]) for annotation in duplicated_annotations]
    if len(set(all_indices)) != len(all_indices):
        raise ValueError("Overlapping indices in table.")

    # check no empty rows / columns
    all_rows = {index[0] for index in all_indices}
    all_columns = {index[1] for index in all_indices}
    if not _check_complete_set(all_rows):
        raise ValueError("Empty row in table.")
    if not _check_complete_set(all_columns):
        raise ValueError("Empty column in table.")


def _get_table_uid_to_cells_mapping(content: Dict[str, Any]) -> Dict[str, List[Dict[str, Any]]]:
    """Recursively get table uids to cells mapping from nested structured document.

    Args:
        content: a node of the content tree. A node without a ``children`` entry is
            treated as a leaf.

    Returns:
        a mapping from the uid of each table found to the list of its direct children of
        type ``TABLE_CELL``. The search does not descend into a table once it is found.
    """
    children = content.get(CHILDREN) or []
    if content[TYPE] == TABLE:
        # termination condition: a table is not searched for nested tables
        return {content[UID]: [child for child in children if child[TYPE] == TABLE_CELL]}

    current_mapping: Dict[str, List[Dict[str, Any]]] = {}
    for child in children:
        # recursive call; a leaf has no children and contributes nothing
        current_mapping.update(_get_table_uid_to_cells_mapping(child))
    return current_mapping


def _get_table_uid_to_annotations_mapping(
    table_uid_to_cells: Dict[str, List[Dict[str, Any]]],
    table_cell_annotations: List[Dict[str, Any]],
) -> Dict[str, List[Dict[str, Any]]]:
    """Get table uid to table structure annotations mapping.

    Args:
        table_uid_to_cells: mapping from table uid to the cells of that table.
        table_cell_annotations: table structure annotations; each one is matched to a
            cell through the first entry of its ``content_uids``.

    Returns:
        a mapping from table uid to the annotations of its cells, in cell order.

    Raises:
        KeyError: if a cell has no matching annotation.
    """
    # An annotation without content uids cannot be matched to any cell, so it is skipped
    # instead of failing on the [0] lookup.
    uid_to_annotation: Dict[str, Dict[str, Any]] = {
        annotation[CONTENT_UIDS][0]: annotation
        for annotation in table_cell_annotations
        if annotation[CONTENT_UIDS]
    }
    return {
        table_uid: [uid_to_annotation[cell[UID]] for cell in cells]
        for table_uid, cells in table_uid_to_cells.items()
    }


def _duplicate_annotations(
    annotations: List[Dict[str, Any]], duplicate_content_flag: bool = False
) -> List[Dict[str, Any]]:
    """Get duplicated annotations.

    Returns a list of annotations with span (1, 1). Input annotations which span more
    than one row / column are duplicated, once per spanned grid location.

    Args:
        annotations: annotations to duplicate
        duplicate_content_flag: if True, duplicate text box content into all spanned table cells.
            If False, only fill the top left cell. Other spanned cells will be empty.

    Returns:
        duplicated annotations. Duplicated annotations must all have span (1, 1).

    Raises:
        ValueError: if the duplicated annotations overlap or leave an empty row / column.
    """
    duplicated_annotations = []
    for annotation in annotations:
        data = annotation[DATA]
        row_span, col_span = data[SPAN]
        row_index, col_index = data[INDEX]
        for row_span_index in range(row_span):
            for col_span_index in range(col_span):
                is_top_left = row_span_index == 0 and col_span_index == 0
                if duplicate_content_flag or is_top_left:
                    # copied so the duplicates do not alias the input list
                    content_uids = list(annotation[CONTENT_UIDS])
                else:
                    content_uids = []
                duplicated_annotations.append(
                    {
                        TYPE: annotation[TYPE],
                        CONTENT_UIDS: content_uids,
                        DATA: {
                            SPAN: (1, 1),
                            INDEX: (row_index + row_span_index, col_index + col_span_index),
                        },
                    }
                )
    _validate_annotations(duplicated_annotations)
    return duplicated_annotations


def _build_grid_from_table_cell_annotations(
    annotations: List[Dict[str, Any]], duplicate_content_flag: bool = False
) -> List[List[List[str]]]:
    """Grid where each location has a list of content uids.

    Args:
        annotations: table structure annotations of a single table.
        duplicate_content_flag: forwarded to ``_duplicate_annotations``.

    Returns:
        a list of rows; each row is a list with one list of content uids per column.
        Locations that no annotation covers hold an empty list.

    Raises:
        ValueError: if an annotation is not a table structure annotation, or if the
            annotations do not describe a valid grid.
    """
    if any(annotation[TYPE] != TABLE_STRUCTURE for annotation in annotations):
        raise ValueError("Table grid can only be built from table structure annotations.")

    duplicated_annotations = _duplicate_annotations(annotations, duplicate_content_flag)
    # defaultdict: a grid location without an annotation yields an empty uid list
    index_to_uids_mapping = defaultdict(
        list,
        {
            annotation[DATA][INDEX]: annotation[CONTENT_UIDS]
            for annotation in duplicated_annotations
        },
    )
    n_rows, n_cols = _get_table_shape(duplicated_annotations)
    return [
        [index_to_uids_mapping[(row_index, col_index)] for col_index in range(n_cols)]
        for row_index in range(n_rows)
    ]


def _convert_uid_grid_to_content_grid(
    uid_grid: List[List[List[str]]], cell_contents: List[Dict[str, Any]]
) -> List[List[str]]:
    """Convert a UID grid to content grid.

    Args:
        uid_grid: grid where each location holds a list of content uids.
        cell_contents: table cells, each with a ``uid`` and a ``content`` entry.

    Returns:
        a grid of the same shape where each location holds the content of the first uid
        listed at that location, or an empty string when the location has no uid.
    """
    uids_to_content = {cell[UID]: cell[CONTENT] for cell in cell_contents}
    return [
        [uids_to_content[content_uids[0]] if content_uids else "" for content_uids in uid_row]
        for uid_row in uid_grid
    ]


def build_table_grids(
    serialized_document: Dict[str, Any], duplicate_merged_cells_content_flag: bool = False
) -> Dict[str, List[List[str]]]:
    """Convert serialized tables to grid of strings.

    Args:
        serialized_document: a serialized document.
        duplicate_merged_cells_content_flag: if True, duplicate cell content for merged cells.
            If False, only fill the first cell (top left) of the merged area, other cells
            are empty.

    Returns:
        a mapping of table UIDs to table grid structures. A table without cells maps to
        an empty grid.

    Raises:
        ValueError: if the annotations of a table do not describe a valid grid.
        KeyError: if a table cell has no table structure annotation.
    """
    annotations = serialized_document[ANNOTATIONS]
    content = serialized_document[CONTENT_TREE]

    table_uid_to_cells_mapping = _get_table_uid_to_cells_mapping(content)
    table_cell_annotations = [
        annotation for annotation in annotations if annotation[TYPE] == TABLE_STRUCTURE
    ]
    table_uid_to_cell_annotations = _get_table_uid_to_annotations_mapping(
        table_uid_to_cells_mapping, table_cell_annotations
    )

    tables = {}
    for table_uid, cell_annotations in table_uid_to_cell_annotations.items():
        grid = _build_grid_from_table_cell_annotations(
            cell_annotations, duplicate_content_flag=duplicate_merged_cells_content_flag
        )
        cell_contents = table_uid_to_cells_mapping[table_uid]
        tables[table_uid] = _convert_uid_grid_to_content_grid(grid, cell_contents)
    return tables
The `build_table_grids` function returns a mapping (dictionary) from table identifiers to their corresponding table grids. Below is an example of using this function to extract table grids from an API response.
# `response` is the API response; its "output" value is the serialized document.
serialized_document = response["output"]
# Mapping from table identifier to that table's grid (a list of rows of cell values).
table_grids = build_table_grids(serialized_document)
Converting to CSV
To convert a grid into a spreadsheet table, we recommend using pandas (a Python data analysis library). Below is an example of converting a grid into a pandas `DataFrame` object and saving the result as a CSV file.
import pandas as pd

# Identifier of the table to export; it must be one of the keys of `table_grids`.
table_id = "5"
table_grid = table_grids[table_id]

# Each inner list of the grid becomes one row of the DataFrame.
table_df = pd.DataFrame(table_grid)
# By default to_csv also writes the row index and the column labels;
# pass index=False and header=False to write only the cell values.
table_df.to_csv("sample_table.csv")