› Kensho Extract ›

Guides

Search

Extracting tables

The table information in the API response is stored in two parts:

  • table cell values (content)
  • table structure (annotations)

Below is a code example that takes the output value from the API response (called serialized_document in the example) and converts it into one grid per table, where a grid is a list of rows and each row is a list of cell values (see build_table_grids).

from collections import defaultdict
from typing import Any, Dict, List, Sequence, Set, Tuple
# Top-level keys of the serialized document.
CONTENT = "content"
ANNOTATIONS = "annotations"

# Keys read from content nodes (CONTENT is also read from table cells).
UID = "uid"
TYPE = "type"
CHILDREN = "children"

# Content node types this example looks for.
TABLE = "TABLE"
TABLE_CELL = "TABLE_CELL"

# Annotation type that carries the table layout.
TABLE_STRUCTURE = "table_structure"

# Keys read from table structure annotations (TYPE is read from them as well).
CONTENT_UIDS = "content_uids"
DATA = "data"
INDEX = "index"
SPAN = "span"
def _get_table_shape(table_structure_annotations: Sequence[Dict[str, Any]]) -> Tuple[int, int]:
"""Get table shape from table structure annotations."""
if any(annotation[TYPE] != TABLE_STRUCTURE for annotation in table_structure_annotations):
raise ValueError("Table grid can only be built from table structure annotations.")
n_rows = max(
annotation[DATA][INDEX][0] + annotation[DATA][SPAN][0]
for annotation in table_structure_annotations
)
n_cols = max(
annotation[DATA][INDEX][1] + annotation[DATA][SPAN][1]
for annotation in table_structure_annotations
)
return n_rows, n_cols
def _check_complete_set(integer_set: Set[int]) -> bool:
"""Check that the set of integers contains all integers between 0 and its max."""
return len(integer_set) == len(set(range(max(integer_set) + 1)))
def _validate_annotations(duplicated_annotations: List[Dict[str, Any]]) -> None:
    """Validate annotations that have already been expanded to single cells.

    Args:
        duplicated_annotations: annotations expected to all have span (1, 1).

    Raises:
        ValueError: if an annotation still spans more than one cell, if two
            annotations share the same index, or if a row or column index
            between 0 and the largest one is not used by any annotation.
    """
    # Merged cells must have been expanded before validation.
    spans = [entry[DATA][SPAN] for entry in duplicated_annotations]
    if not all(span == (1, 1) for span in spans):
        raise ValueError("Un-duplicated merged cells in table.")

    # Two annotations may not claim the same grid position.
    positions = [entry[DATA][INDEX] for entry in duplicated_annotations]
    if len(positions) > len(set(positions)):
        raise ValueError("Overlapping indices in table.")

    # Row and column indices must be gap-free.
    used_rows = {position[0] for position in positions}
    used_columns = {position[1] for position in positions}
    if not _check_complete_set(used_rows):
        raise ValueError("Empty row in table.")
    if not _check_complete_set(used_columns):
        raise ValueError("Empty column in table.")
def _get_table_uid_to_cells_mapping(content: Dict[str, Any]) -> Dict[str, List[Dict[str, Any]]]:
    """Walk the nested structured document and map each table uid to its cells.

    Args:
        content: a content node of the document (the root on the first call).

    Returns:
        A mapping from the uid of every TABLE node reached to the list of its
        direct children of type TABLE_CELL. The walk does not descend into a
        table once it is found.
    """
    cells_by_table: Dict[str, List[Dict[str, Any]]] = {}

    def _walk(node: Dict[str, Any]) -> None:
        if node[TYPE] == TABLE:
            # A table ends the descent: keep its cell children and stop here.
            cells_by_table[node[UID]] = [
                child for child in node[CHILDREN] if child[TYPE] == TABLE_CELL
            ]
            return
        # A node without children simply contributes nothing.
        for child in node[CHILDREN]:
            _walk(child)

    _walk(content)
    return cells_by_table
def _get_table_uid_to_annotations_mapping(
table_uid_to_cells: Dict[str, List[Dict[str, Any]]],
table_cell_annotations: List[Dict[str, Any]],
) -> Dict[str, List[Dict[str, Any]]]:
"""Get table uid to table structure annotations mapping."""
uid_to_annotation: Dict[str, Dict[str, Any]] = {
annotation[CONTENT_UIDS][0]: annotation for annotation in table_cell_annotations
}
table_to_annotations = {}
for table_uid, cells in table_uid_to_cells.items():
cell_uids = [cell[UID] for cell in cells]
table_to_annotations[table_uid] = [uid_to_annotation[uid] for uid in cell_uids]
return table_to_annotations
def _duplicate_annotations(
    annotations: List[Dict[str, Any]], duplicate_content_flag: bool = False
) -> List[Dict[str, Any]]:
    """Expand annotations so that each one covers exactly one grid cell.

    An input annotation that spans several rows and / or columns is replaced by
    one annotation of span (1, 1) per covered cell. The expanded list is
    validated before it is returned.

    Args:
        annotations: annotations to duplicate
        duplicate_content_flag: if True, every covered cell refers to the content
            uids of the original annotation. If False, only the top left cell
            does and the other covered cells get an empty list.

    Returns:
        duplicated annotations, all with span (1, 1).
    """
    expanded = []
    for source in annotations:
        placement = source[DATA]
        n_spanned_rows, n_spanned_cols = placement[SPAN]
        top_row, left_col = placement[INDEX]
        for row_offset in range(n_spanned_rows):
            for col_offset in range(n_spanned_cols):
                is_top_left = row_offset == 0 and col_offset == 0
                keeps_content = duplicate_content_flag or is_top_left
                uids = source[CONTENT_UIDS] if keeps_content else []
                expanded.append(
                    {
                        TYPE: source[TYPE],
                        CONTENT_UIDS: uids,
                        DATA: {
                            SPAN: (1, 1),
                            INDEX: (top_row + row_offset, left_col + col_offset),
                        },
                    }
                )
    _validate_annotations(expanded)
    return expanded
def _build_grid_from_table_cell_annotations(
    annotations: List[Dict[str, Any]], duplicate_content_flag: bool = False
) -> List[List[List[str]]]:
    """Build a grid in which each location holds a list of content uids.

    Args:
        annotations: table structure annotations of a single table.
        duplicate_content_flag: forwarded to the expansion of merged cells.

    Returns:
        A list of rows; each row is a list of content uid lists. Locations that
        no annotation covers hold an empty list.

    Raises:
        ValueError: if an annotation is not a table structure annotation.
    """
    for annotation in annotations:
        if annotation[TYPE] != TABLE_STRUCTURE:
            raise ValueError("Table grid can only be built from table structure annotations.")

    single_cell_annotations = _duplicate_annotations(annotations, duplicate_content_flag)

    # Unannotated locations fall back to an empty uid list.
    uids_at = defaultdict(list)
    for cell_annotation in single_cell_annotations:
        uids_at[cell_annotation[DATA][INDEX]] = cell_annotation[CONTENT_UIDS]

    n_rows, n_cols = _get_table_shape(single_cell_annotations)
    return [[uids_at[(row, col)] for col in range(n_cols)] for row in range(n_rows)]
def _convert_uid_grid_to_content_grid(
uid_grid: List[List[List[str]]], cell_contents: List[Dict[str, Any]]
) -> List[List[str]]:
"""Convert a UID grid to content grid."""
uids_to_content = {cell[UID]: cell[CONTENT] for cell in cell_contents}
content_grid = []
for uid_row in uid_grid:
content_row = []
for content_uids in uid_row:
if len(content_uids) > 0:
first_content_uid = content_uids[0]
text = uids_to_content[first_content_uid]
else:
text = ""
content_row.append(text)
content_grid.append(content_row)
return content_grid
def build_table_grids(
    serialized_document: Dict[str, Any], duplicate_merged_cells_content_flag: bool = False
) -> Dict[str, List[List[str]]]:
    """Convert the tables of a serialized document to grids of strings.

    Args:
        serialized_document: a serialized document holding the nested content and
            the list of annotations.
        duplicate_merged_cells_content_flag: if True, the content of a merged cell is
            repeated in every location it covers. If False, only the top left location
            is filled and the others are empty.

    Returns:
        a mapping of table UIDs to table grids, a grid being a list of rows and a
        row a list of cell strings.
    """
    document_annotations = serialized_document[ANNOTATIONS]
    root_content = serialized_document[CONTENT]

    cells_by_table = _get_table_uid_to_cells_mapping(root_content)

    # Only table structure annotations describe the layout of the cells.
    structure_annotations = []
    for annotation in document_annotations:
        if annotation[TYPE] == TABLE_STRUCTURE:
            structure_annotations.append(annotation)

    annotations_by_table = _get_table_uid_to_annotations_mapping(
        cells_by_table, structure_annotations
    )

    return {
        table_uid: _convert_uid_grid_to_content_grid(
            _build_grid_from_table_cell_annotations(
                table_annotations, duplicate_content_flag=duplicate_merged_cells_content_flag
            ),
            cells_by_table[table_uid],
        )
        for table_uid, table_annotations in annotations_by_table.items()
    }

The build_table_grids function returns a mapping (dictionary) from table identifiers to their corresponding table grids. Below is an example of using this function to extract table grids from an API response.

# `response` is not defined in this snippet; it is assumed to be the already
# parsed API response (a dict) whose "output" value is the serialized document.
serialized_document = response["output"]
# Maps each table UID to its grid: a list of rows, each row a list of cell strings.
table_grids = build_table_grids(serialized_document)

Converting to CSV

To convert a grid into a spreadsheet table, we recommend using pandas (the Python data analysis library). Below is an example of converting a grid into a pandas DataFrame object and saving the result to a CSV file.

import pandas as pd

# UID of the table to export; it must be one of the keys of `table_grids`.
# "5" is only an example value.
table_id = "5"
table_grid = table_grids[table_id]
table_df = pd.DataFrame(table_grid)
# The grid already holds every row of the table, so do not let pandas add its
# own integer index column and integer header row to the file.
table_df.to_csv("sample_table.csv", index=False, header=False)