Extracting tables

Extracting tables

The table information in the API response is stored in two parts:

  • Table cell values (content)
  • Table structure (annotations)

Below is a code example that takes the output value from the API response and converts it into multiple grids, where a grid is a list of rows and each row is a list of cell values (see build_table_grids).

from collections import defaultdict
from typing import Any, Dict, List, Sequence, Set, Tuple
# Keys used in the serialized document and its annotations.
DATA = "data"
SPAN = "span"
INDEX = "index"
TYPE = "type"
CHILDREN = "children"
CONTENT_UIDS = "content_uids"
CONTENT_TREE = "content_tree"
CONTENT = "content"
ANNOTATIONS = "annotations"
UID = "uid"

# Annotation type carrying the (index, span) layout of a table cell.
TABLE_STRUCTURE = "table_structure"

# Content-tree node types. The helpers below compare node types against these two names.
# NOTE(review): the values are assumed to match the node type strings in the API response
# -- confirm against a real response before relying on them.
TABLE = "TABLE"
TABLE_CELL = "TABLE_CELL"
def _get_table_shape(table_structure_annotations: Sequence[Dict[str, Any]]) -> Tuple[int, int]:
    """Get table shape from table structure annotations.

    The shape is the furthest extent reached by any cell: for each annotation the start
    index plus the span gives the (exclusive) end row / column, and the maximum over all
    annotations is the number of rows / columns.

    Args:
        table_structure_annotations: annotations of type TABLE_STRUCTURE, each holding a
            (row, column) index and a (row span, column span) under its data entry.

    Returns:
        (n_rows, n_cols). (0, 0) when no annotations are given.

    Raises:
        ValueError: if any annotation is not a table structure annotation.
    """
    if any(annotation[TYPE] != TABLE_STRUCTURE for annotation in table_structure_annotations):
        raise ValueError("Table grid can only be built from table structure annotations.")
    # default=0 keeps an annotation-less table from crashing max() on an empty iterable.
    n_rows = max(
        (
            annotation[DATA][INDEX][0] + annotation[DATA][SPAN][0]
            for annotation in table_structure_annotations
        ),
        default=0,
    )
    n_cols = max(
        (
            annotation[DATA][INDEX][1] + annotation[DATA][SPAN][1]
            for annotation in table_structure_annotations
        ),
        default=0,
    )
    return n_rows, n_cols
def _check_complete_set(integer_set: Set[int]) -> bool:
    """Check that the set of integers contains all integers between 0 and its max."""
    return len(integer_set) == len(set(range(max(integer_set) + 1)))
def _validate_annotations(duplicated_annotations: List[Dict[str, Any]]) -> None:
    """Validate duplicated annotations.

    Args:
        duplicated_annotations: annotations in which every merged cell has already been
            expanded into single cells, so each one is expected to have span (1, 1).

    Raises:
        ValueError: if a span differs from (1, 1), if two annotations share an index, or
            if a row or column index is skipped.
    """
    # Normalise to tuples: spans / indices given as lists (e.g. decoded from JSON) would
    # never compare equal to (1, 1) and are unhashable, so they could not be put in a set.
    all_spans = [tuple(annotation[DATA][SPAN]) for annotation in duplicated_annotations]
    # check all spans are 1 (annotations are duplicated)
    if any(span != (1, 1) for span in all_spans):
        raise ValueError("Un-duplicated merged cells in table.")
    # check no overlap
    all_indices = [tuple(annotation[DATA][INDEX]) for annotation in duplicated_annotations]
    if len(set(all_indices)) != len(all_indices):
        raise ValueError("Overlapping indices in table.")
    # check no empty rows / columns
    all_rows = {index[0] for index in all_indices}
    all_columns = {index[1] for index in all_indices}
    if not _check_complete_set(all_rows):
        raise ValueError("Empty row in table.")
    if not _check_complete_set(all_columns):
        raise ValueError("Empty column in table.")
def _get_table_uid_to_cells_mapping(content: Dict[str, Any]) -> Dict[str, List[Dict[str, Any]]]:
    """Recursively get table uids to cells mapping from nested structured document.

    Relies on the module-level TABLE and TABLE_CELL node type names.

    Args:
        content: a node of the content tree. A node has a type and a uid; its child nodes,
            if any, are listed under the children key.

    Returns:
        A mapping from the uid of every table found at or below this node to the list of
        that table's cell nodes. The search does not descend into a table once found.
    """
    current_mapping: Dict[str, List[Dict[str, Any]]] = {}
    # Nodes without children (missing key or None) are treated as having none.
    children = content.get(CHILDREN) or []
    if content[TYPE] == TABLE:
        # termination condition: record this table's cells and stop descending
        current_mapping[content[UID]] = [child for child in children if child[TYPE] == TABLE_CELL]
    else:
        for child in children:
            # recursive call; merge what the subtree found, otherwise it would be lost
            current_mapping.update(_get_table_uid_to_cells_mapping(child))
    return current_mapping
def _get_table_uid_to_annotations_mapping(
    table_uid_to_cells: Dict[str, List[Dict[str, Any]]],
    table_cell_annotations: List[Dict[str, Any]],
) -> Dict[str, List[Dict[str, Any]]]:
    """Get table uid to table structure annotations mapping.

    Args:
        table_uid_to_cells: mapping from table uid to that table's cell nodes.
        table_cell_annotations: table structure annotations; each is keyed here by the
            first entry of its content uids, which is matched against the cell uids.

    Returns:
        A mapping from table uid to the annotations of its cells, in cell order.

    Raises:
        KeyError: if a cell has no annotation whose first content uid is the cell's uid.
    """
    uid_to_annotation: Dict[str, Dict[str, Any]] = {
        annotation[CONTENT_UIDS][0]: annotation for annotation in table_cell_annotations
    }
    table_to_annotations: Dict[str, List[Dict[str, Any]]] = {}
    for table_uid, cells in table_uid_to_cells.items():
        table_to_annotations[table_uid] = [uid_to_annotation[cell[UID]] for cell in cells]
    return table_to_annotations
def _duplicate_annotations(
    annotations: List[Dict[str, Any]], duplicate_content_flag: bool = False
) -> List[Dict[str, Any]]:
    """Get duplicated annotations.

    Returns a list of annotations with span (1, 1). Input annotations which span more
    than one row / column are duplicated, once per grid location they cover.

    Args:
        annotations: annotations to duplicate
        duplicate_content_flag: if True, duplicate text box content into all spanned table cells.
            If False, only fill the top left cell. Other spanned cells will be empty.

    Returns:
        duplicated annotations. Duplicated annotations must all have span (1, 1).
    """
    duplicated_annotations = []
    for annotation in annotations:
        data = annotation[DATA]
        row_span, col_span = data[SPAN]
        row_index, col_index = data[INDEX]
        for row_span_index in range(row_span):
            for col_span_index in range(col_span):
                if duplicate_content_flag or (row_span_index == 0 and col_span_index == 0):
                    # copy, so the duplicates do not share one list with the input
                    content_uids = list(annotation[CONTENT_UIDS])
                else:
                    content_uids = []
                new_annotation = {
                    TYPE: annotation[TYPE],
                    CONTENT_UIDS: content_uids,
                    DATA: {
                        SPAN: (1, 1),
                        INDEX: (row_index + row_span_index, col_index + col_span_index),
                    },
                }
                duplicated_annotations.append(new_annotation)
    return duplicated_annotations
def _build_grid_from_table_cell_annotations(
    annotations: List[Dict[str, Any]], duplicate_content_flag: bool = False
) -> List[List[List[str]]]:
    """Grid where each location has a list of content uids.

    Args:
        annotations: table structure annotations of a single table.
        duplicate_content_flag: passed to _duplicate_annotations; if True the content
            uids of a merged cell are repeated in every location it spans.

    Returns:
        A list of rows; each row is a list with one entry per column, and each entry is
        the list of content uids at that location (empty if no annotation covers it).

    Raises:
        ValueError: if any annotation is not a table structure annotation.
    """
    if any(annotation[TYPE] != TABLE_STRUCTURE for annotation in annotations):
        raise ValueError("Table grid can only be built from table structure annotations.")
    duplicated_annotations = _duplicate_annotations(annotations, duplicate_content_flag)
    # defaultdict(list): locations not covered by any annotation yield an empty uid list.
    index_to_uids_mapping = defaultdict(
        list,
        {
            annotation[DATA][INDEX]: annotation[CONTENT_UIDS]
            for annotation in duplicated_annotations
        },
    )
    n_rows, n_cols = _get_table_shape(duplicated_annotations)
    rows: List[List[List[str]]] = []
    for row_index in range(n_rows):
        current_row = []
        for col_index in range(n_cols):
            current_row.append(index_to_uids_mapping[(row_index, col_index)])
        rows.append(current_row)
    return rows
def _convert_uid_grid_to_content_grid(
    uid_grid: List[List[List[str]]], cell_contents: List[Dict[str, Any]]
) -> List[List[str]]:
    """Convert a UID grid to content grid.

    Args:
        uid_grid: list of rows, each a list of per-location content uid lists.
        cell_contents: cell nodes, each providing a uid and its content.

    Returns:
        A grid of the same shape as uid_grid holding, for each location, the content of
        the first uid listed there, or an empty string when no uid is listed.

    Raises:
        KeyError: if a uid in the grid does not belong to any of the given cells.
    """
    uids_to_content = {cell[UID]: cell[CONTENT] for cell in cell_contents}
    content_grid = []
    for uid_row in uid_grid:
        content_row = []
        for content_uids in uid_row:
            if content_uids:
                # only the first uid of a location is used
                text = uids_to_content[content_uids[0]]
            else:
                text = ""
            content_row.append(text)
        content_grid.append(content_row)
    return content_grid
def build_table_grids(
    serialized_document: Dict[str, Any], duplicate_merged_cells_content_flag: bool = False
) -> Dict[str, List[List[str]]]:
    """Convert serialized tables to grid of strings.

    Args:
        serialized_document: a serialized document.
        duplicate_merged_cells_content_flag: if True, duplicate cell content for merged cells. If False,
            only fill the first cell (top left) of the merged area, other cells are empty.

    Returns:
        a mapping of table UIDs to table grid structures
    """
    annotations = serialized_document[ANNOTATIONS]
    content = serialized_document[CONTENT_TREE]
    table_uid_to_cells_mapping = _get_table_uid_to_cells_mapping(content)
    # Only table structure annotations describe where a cell sits in its table.
    table_cell_annotations = [
        annotation for annotation in annotations if annotation[TYPE] == TABLE_STRUCTURE
    ]
    table_uid_to_cell_annotations = _get_table_uid_to_annotations_mapping(
        table_uid_to_cells_mapping, table_cell_annotations
    )
    tables = {}
    # Distinct loop name so the document-level `annotations` above is not shadowed.
    for table_uid, cell_annotations in table_uid_to_cell_annotations.items():
        grid = _build_grid_from_table_cell_annotations(
            cell_annotations, duplicate_content_flag=duplicate_merged_cells_content_flag
        )
        cell_contents = table_uid_to_cells_mapping[table_uid]
        content_grid = _convert_uid_grid_to_content_grid(grid, cell_contents)
        tables[table_uid] = content_grid
    return tables

build_table_grids returns a mapping (dictionary) from the table identifiers to corresponding table grids. Below is an example of using this function to extract table grids from an API response.

# `response` is assumed to be the already-decoded API response (a dict) -- confirm with
# your client code. Its "output" value is the serialized document.
serialized_document = response["output"]
# Mapping from each table identifier to its grid (a list of rows of cell values).
table_grids = build_table_grids(serialized_document)

Converting to CSV

To convert a grid into a spreadsheet table, we recommend using pandas (a Python data analysis library). Below is an example of converting a grid into a pandas DataFrame object and saving the result as a CSV file.

import pandas as pd

table_id = "5"  # The desired key from table_grids.keys()
table_grid = table_grids[table_id]
table_df = pd.DataFrame(table_grid)
# Save the grid as CSV. The DataFrame's integer index and column labels are positional
# defaults rather than table content, so they are left out of the file.
table_df.to_csv(f"table_{table_id}.csv", index=False, header=False)