Kensho Extract



Extracting tables

The table information in the API response is stored in two parts:

  • table cell values (content)
  • table structure (annotations)

Below is a code example that takes the output value from the API response (called serialized_document in the example) and converts it into multiple grids, where a grid is a list of rows and each row is a list of cell values (see build_table_grids).

from collections import defaultdict
from typing import Any, Dict, List, Sequence, Set, Tuple
# Keys of a table structure annotation and of its "data" payload.
DATA = "data"
SPAN = "span"
INDEX = "index"
TYPE = "type"
CONTENT_UIDS = "content_uids"

# Keys of a content tree node.
CHILDREN = "children"
CONTENT = "content"
UID = "uid"

# Top-level keys of the serialized document.
CONTENT_TREE = "content_tree"
ANNOTATIONS = "annotations"

# Value of the "type" key for table structure annotations.
TABLE_STRUCTURE = "table_structure"

# Values of the "type" key for table and table cell nodes of the content tree.
# NOTE(review): assumed to be the upper-case node type names used by the API;
# confirm against a real response before relying on them.
TABLE = "TABLE"
TABLE_CELL = "TABLE_CELL"
def _get_table_shape(table_structure_annotations: Sequence[Dict[str, Any]]) -> Tuple[int, int]:
    """Get table shape from table structure annotations.

    Args:
        table_structure_annotations: table structure annotations; each one carries a
            (row, column) index and a (row, column) span under its data key.

    Returns:
        (n_rows, n_cols), the smallest grid containing every annotated cell.
        An empty sequence gives (0, 0).

    Raises:
        ValueError: if any annotation is not a table structure annotation.
    """
    if any(annotation[TYPE] != TABLE_STRUCTURE for annotation in table_structure_annotations):
        raise ValueError("Table grid can only be built from table structure annotations.")
    # A cell that starts at `index` and covers `span` rows / columns ends at index + span,
    # so the largest such end is the table size along that axis.
    n_rows = max(
        (
            annotation[DATA][INDEX][0] + annotation[DATA][SPAN][0]
            for annotation in table_structure_annotations
        ),
        default=0,
    )
    n_cols = max(
        (
            annotation[DATA][INDEX][1] + annotation[DATA][SPAN][1]
            for annotation in table_structure_annotations
        ),
        default=0,
    )
    return n_rows, n_cols
def _check_complete_set(integer_set: Set[int]) -> bool:
"""Check that the set of integers contains all integers between 0 and its max."""
return len(integer_set) == len(set(range(max(integer_set) + 1)))
def _validate_annotations(duplicated_annotations: List[Dict[str, Any]]) -> None:
    """Validate duplicated annotations.

    Checks that every annotation has span (1, 1), that no two annotations share an
    index, and that the row indices and the column indices each form a gap-free range
    starting at 0.

    Args:
        duplicated_annotations: annotations in which merged cells were already expanded.

    Raises:
        ValueError: if a span is not (1, 1), if two annotations share an index, or if a
            row or a column index is missing.
    """
    # Normalise to tuples: spans / indices that arrive as lists (e.g. decoded from JSON)
    # never compare equal to (1, 1) and cannot be put in a set.
    # check all spans are 1 (annotations are duplicated)
    all_spans = [tuple(annotation[DATA][SPAN]) for annotation in duplicated_annotations]
    if any(span != (1, 1) for span in all_spans):
        raise ValueError("Un-duplicated merged cells in table.")
    # check no overlap
    all_indices = [tuple(annotation[DATA][INDEX]) for annotation in duplicated_annotations]
    if len(set(all_indices)) != len(all_indices):
        raise ValueError("Overlapping indices in table.")
    if not all_indices:
        # No cells at all: there is no row or column that could be missing.
        return
    # check no empty rows / columns
    all_rows = {index[0] for index in all_indices}
    all_columns = {index[1] for index in all_indices}
    if not _check_complete_set(all_rows):
        raise ValueError("Empty row in table.")
    if not _check_complete_set(all_columns):
        raise ValueError("Empty column in table.")
def _get_table_uid_to_cells_mapping(content: Dict[str, Any]) -> Dict[str, List[Dict[str, Any]]]:
    """Recursively get table uids to cells mapping from nested structured document.

    Args:
        content: a node of the content tree, with a type, a uid and a list of children.

    Returns:
        a mapping from the uid of every table found at or below `content` to the list
        of that table's cell nodes.
    """
    current_mapping: Dict[str, List[Dict[str, Any]]] = {}
    if content[TYPE] == TABLE:
        # termination condition 1: a table node; collect its cells and stop descending
        current_mapping[content[UID]] = [
            child for child in content[CHILDREN] if child[TYPE] == TABLE_CELL
        ]
        return current_mapping
    # termination condition 2: a node without children; the loop below does nothing
    for child in content[CHILDREN]:
        # recursive call; merge the tables found in this subtree
        current_mapping.update(_get_table_uid_to_cells_mapping(child))
    return current_mapping
def _get_table_uid_to_annotations_mapping(
    table_uid_to_cells: Dict[str, List[Dict[str, Any]]],
    table_cell_annotations: List[Dict[str, Any]],
) -> Dict[str, List[Dict[str, Any]]]:
    """Get table uid to table structure annotations mapping.

    Args:
        table_uid_to_cells: mapping from table uid to the cell nodes of that table.
        table_cell_annotations: table structure annotations; each is matched to a cell
            through the first entry of its content uids.

    Returns:
        a mapping from table uid to the annotations of its cells, in cell order.

    Raises:
        KeyError: if a cell has no matching annotation.
    """
    # Annotations without any content uid cannot be matched to a cell, so skip them
    # rather than failing on the [0] lookup.
    uid_to_annotation: Dict[str, Dict[str, Any]] = {
        annotation[CONTENT_UIDS][0]: annotation
        for annotation in table_cell_annotations
        if annotation[CONTENT_UIDS]
    }
    return {
        table_uid: [uid_to_annotation[cell[UID]] for cell in cells]
        for table_uid, cells in table_uid_to_cells.items()
    }
def _duplicate_annotations(
    annotations: List[Dict[str, Any]], duplicate_content_flag: bool = False
) -> List[Dict[str, Any]]:
    """Get duplicated annotations.

    Returns a list of annotations with span (1, 1). Input annotations which span more
    than one row / column are duplicated, once for every grid position they cover.

    Args:
        annotations: annotations to duplicate
        duplicate_content_flag: if True, duplicate text box content into all spanned
            table cells. If False, only fill the top left cell. Other spanned cells
            will be empty.

    Returns:
        duplicated annotations. Duplicated annotations must all have span (1, 1).

    Raises:
        ValueError: if the duplicated annotations fail `_validate_annotations`
            (overlapping indices, or an empty row / column).
    """
    duplicated_annotations = []
    for annotation in annotations:
        data = annotation[DATA]
        row_span, col_span = data[SPAN]
        row_index, col_index = data[INDEX]
        for row_span_index in range(row_span):
            for col_span_index in range(col_span):
                is_top_left = row_span_index == 0 and col_span_index == 0
                if duplicate_content_flag or is_top_left:
                    # Copy so the new annotation does not share a list with the input.
                    content_uids = list(annotation[CONTENT_UIDS])
                else:
                    content_uids = []
                # Span and index live under the data key, as in the input annotations,
                # so the result can be read by the same code as the originals.
                duplicated_annotations.append(
                    {
                        TYPE: annotation[TYPE],
                        CONTENT_UIDS: content_uids,
                        DATA: {
                            SPAN: (1, 1),
                            INDEX: (row_index + row_span_index, col_index + col_span_index),
                        },
                    }
                )
    if duplicated_annotations:
        _validate_annotations(duplicated_annotations)
    return duplicated_annotations
def _build_grid_from_table_cell_annotations(
    annotations: List[Dict[str, Any]], duplicate_content_flag: bool = False
) -> List[List[List[str]]]:
    """Grid where each location has a list of content uids.

    Args:
        annotations: table structure annotations of a single table.
        duplicate_content_flag: passed on to `_duplicate_annotations`; controls whether
            the content uids of a merged cell are repeated in every position it covers.

    Returns:
        a list of rows; each row is a list with one entry per column, and each entry is
        the list of content uids at that position (empty if nothing is annotated there).

    Raises:
        ValueError: if any annotation is not a table structure annotation.
    """
    if any(annotation[TYPE] != TABLE_STRUCTURE for annotation in annotations):
        raise ValueError("Table grid can only be built from table structure annotations.")
    duplicated_annotations = _duplicate_annotations(annotations, duplicate_content_flag)
    # tuple() keeps the index usable as a dict key even if it arrives as a list.
    index_to_uids_mapping = {
        tuple(annotation[DATA][INDEX]): annotation[CONTENT_UIDS]
        for annotation in duplicated_annotations
    }
    n_rows, n_cols = _get_table_shape(duplicated_annotations)
    rows: List[List[List[str]]] = [
        [index_to_uids_mapping.get((row_index, col_index), []) for col_index in range(n_cols)]
        for row_index in range(n_rows)
    ]
    return rows
def _convert_uid_grid_to_content_grid(
    uid_grid: List[List[List[str]]], cell_contents: List[Dict[str, Any]]
) -> List[List[str]]:
    """Convert a UID grid to content grid.

    Args:
        uid_grid: a list of rows, where each position holds a list of content uids.
        cell_contents: the cell nodes of the table, each with a uid and a content.

    Returns:
        a grid of the same shape where each position holds the content of the first
        uid listed there, or an empty string if no uid is listed.

    Raises:
        KeyError: if a uid in the grid does not belong to any of the given cells.
    """
    uids_to_content = {cell[UID]: cell[CONTENT] for cell in cell_contents}
    content_grid = []
    for uid_row in uid_grid:
        content_row = []
        for content_uids in uid_row:
            if content_uids:
                # Only the first uid of a position is used for its text.
                text = uids_to_content[content_uids[0]]
            else:
                text = ""
            content_row.append(text)
        content_grid.append(content_row)
    return content_grid
def build_table_grids(
    serialized_document: Dict[str, Any], duplicate_merged_cells_content_flag: bool = False
) -> Dict[str, List[List[str]]]:
    """Convert serialized tables to grid of strings.

    Args:
        serialized_document: a serialized document.
        duplicate_merged_cells_content_flag: if True, duplicate cell content for merged
            cells. If False, only fill the first cell (top left) of the merged area,
            other cells are empty.

    Returns:
        a mapping of table UIDs to table grid structures
    """
    annotations = serialized_document[ANNOTATIONS]
    content = serialized_document[CONTENT_TREE]
    table_uid_to_cells_mapping = _get_table_uid_to_cells_mapping(content)
    table_cell_annotations = [
        annotation for annotation in annotations if annotation[TYPE] == TABLE_STRUCTURE
    ]
    table_uid_to_cell_annotations = _get_table_uid_to_annotations_mapping(
        table_uid_to_cells_mapping, table_cell_annotations
    )
    tables = {}
    # The loop variable has its own name so it does not shadow the document-level
    # `annotations` list read above.
    for table_uid, cell_annotations in table_uid_to_cell_annotations.items():
        grid = _build_grid_from_table_cell_annotations(
            cell_annotations, duplicate_content_flag=duplicate_merged_cells_content_flag
        )
        cell_contents = table_uid_to_cells_mapping[table_uid]
        tables[table_uid] = _convert_uid_grid_to_content_grid(grid, cell_contents)
    return tables

The build_table_grids function returns a mapping (dictionary) from table identifiers to their corresponding table grids. Below is an example of using this function to extract table grids from an API response.

# `response` is the API response; its "output" value is the serialized document.
serialized_document = response["output"]
# Mapping from table identifier to that table's grid (a list of rows of cell values).
table_grids = build_table_grids(serialized_document)

Converting to CSV

In order to convert a grid into a spreadsheet table, we recommend using Pandas (Python data analysis library). Below is an example of converting a grid into a pandas DataFrame object and saving the result into CSV.

import pandas as pd

# Identifier of the table to export; "5" is only an example key of `table_grids`.
table_id = "5"
table_grid = table_grids[table_id]
table_df = pd.DataFrame(table_grid)
# The grid has no header row or row labels of its own, so write neither the
# auto-generated integer column names nor the integer index.
table_df.to_csv(f"table_{table_id}.csv", index=False, header=False)