Extracting tables

Extracting tables

The table information in the API response is stored in two parts:

  • Table cell values (content)
  • Table structure (annotations)

Below is a code example that takes the "output" value from the API response and converts it into one grid per table, where a grid is a list of rows and each row is a list of cell values (see build_table_grids).

from collections import defaultdict
from typing import Any, Dict, List, Sequence, Set, Tuple
 
 
# Dictionary keys and type tags used to navigate the serialized document.
DATA = "data"  # annotation key: payload holding the cell's span and index
SPAN = "span"  # key inside DATA: (row_span, col_span) of a cell
INDEX = "index"  # key inside DATA: (row_index, col_index) of a cell
TYPE = "type"  # key holding the type tag of a content node or an annotation
CHILDREN = "children"  # content node key: nested content nodes
TABLE_CELL = "TABLE_CELL"  # TYPE value of a table cell content node
TABLE = "TABLE"  # TYPE value of a table content node
CONTENT_UIDS = "content_uids"  # annotation key: uids of the content nodes it refers to
TABLE_STRUCTURE = "table_structure"  # TYPE value of a table structure annotation
CONTENT_TREE = "content_tree"  # serialized document key: root of the nested content
CONTENT = "content"  # table cell key: the cell value placed in the output grid
ANNOTATIONS = "annotations"  # serialized document key: list of annotations
UID = "uid"  # content node key: identifier of the node
 
 
def _get_table_shape(table_structure_annotations: Sequence[Dict[str, Any]]) -> Tuple[int, int]:
    """Get table shape from table structure annotations."""
    if any(annotation[TYPE] != TABLE_STRUCTURE for annotation in table_structure_annotations):
        raise ValueError("Table grid can only be built from table structure annotations.")
    n_rows = max(
        annotation[DATA][INDEX][0] + annotation[DATA][SPAN][0]
        for annotation in table_structure_annotations
    )
    n_cols = max(
        annotation[DATA][INDEX][1] + annotation[DATA][SPAN][1]
        for annotation in table_structure_annotations
    )
    return n_rows, n_cols
 
 
def _check_complete_set(integer_set: Set[int]) -> bool:
    """Check that the set of integers contains all integers between 0 and its max."""
    return len(integer_set) == len(set(range(max(integer_set) + 1)))
 
 
def _validate_annotations(duplicated_annotations: List[Dict[str, Any]]) -> None:
    """Check that duplicated annotations describe a well-formed table.

    Args:
        duplicated_annotations: annotations expected to each cover a single cell.

    Raises:
        ValueError: if an annotation has a span other than (1, 1), if two
            annotations share the same index, or if a row or a column index is
            skipped.
    """
    # Every annotation must cover exactly one cell once merged cells are expanded.
    spans = [entry[DATA][SPAN] for entry in duplicated_annotations]
    if not all(span == (1, 1) for span in spans):
        raise ValueError("Un-duplicated merged cells in table.")

    # Two annotations must never claim the same grid position.
    positions = [entry[DATA][INDEX] for entry in duplicated_annotations]
    distinct_positions = set(positions)
    if len(distinct_positions) != len(positions):
        raise ValueError("Overlapping indices in table.")

    # Row and column indices must be used without gaps.
    rows_used = {position[0] for position in positions}
    columns_used = {position[1] for position in positions}
    checks = (
        (rows_used, "Empty row in table."),
        (columns_used, "Empty column in table."),
    )
    for used, message in checks:
        if not _check_complete_set(used):
            raise ValueError(message)
 
 
def _get_table_uid_to_cells_mapping(content: Dict[str, Any]) -> Dict[str, List[Dict[str, Any]]]:
    """Collect, for every table in a nested structured document, its table cells.

    The document is walked depth-first in document order. A table node is not
    descended into further: only its direct children of type table cell are kept.

    Args:
        content: root node of the nested content.

    Returns:
        A mapping from table uid to the list of that table's cell nodes.
    """
    cells_by_table = {}
    # Explicit stack; children are pushed in reverse so they are popped in
    # their original order.
    pending = [content]
    while pending:
        node = pending.pop()
        if node[TYPE] == TABLE:
            cells_by_table[node[UID]] = [
                candidate for candidate in node[CHILDREN] if candidate[TYPE] == TABLE_CELL
            ]
        else:
            pending.extend(reversed(node[CHILDREN]))
    return cells_by_table
 
 
def _get_table_uid_to_annotations_mapping(
    table_uid_to_cells: Dict[str, List[Dict[str, Any]]],
    table_cell_annotations: List[Dict[str, Any]],
) -> Dict[str, List[Dict[str, Any]]]:
    """Get table uid to table structure annotations mapping."""
    uid_to_annotation: Dict[str, Dict[str, Any]] = {
        annotation[CONTENT_UIDS][0]: annotation for annotation in table_cell_annotations
    }
    table_to_annotations = {}
    for table_uid, cells in table_uid_to_cells.items():
        cell_uids = [cell[UID] for cell in cells]
        table_to_annotations[table_uid] = [uid_to_annotation[uid] for uid in cell_uids]
    return table_to_annotations
 
 
def _duplicate_annotations(
    annotations: List[Dict[str, Any]], duplicate_content_flag: bool = False
) -> List[Dict[str, Any]]:
    """Expand annotations so that each one covers exactly one table cell.

    An input annotation spanning several rows and / or columns is replaced by
    one annotation of span (1, 1) per covered cell. The result is validated
    before being returned.

    Args:
        annotations: annotations to duplicate.
        duplicate_content_flag: if True, every cell of a merged area refers to
            the content uids of the original annotation. If False, only the
            top left cell does; the other cells get an empty list.

    Returns:
        The duplicated annotations, all with span (1, 1).
    """
    expanded = []
    for source in annotations:
        payload = source[DATA]
        n_spanned_rows, n_spanned_cols = payload[SPAN]
        top, left = payload[INDEX]
        offsets = [
            (row_offset, col_offset)
            for row_offset in range(n_spanned_rows)
            for col_offset in range(n_spanned_cols)
        ]
        for row_offset, col_offset in offsets:
            is_top_left = row_offset == 0 and col_offset == 0
            uids = source[CONTENT_UIDS] if (duplicate_content_flag or is_top_left) else []
            expanded.append(
                {
                    TYPE: source[TYPE],
                    CONTENT_UIDS: uids,
                    DATA: {
                        SPAN: (1, 1),
                        INDEX: (top + row_offset, left + col_offset),
                    },
                }
            )
    _validate_annotations(expanded)
    return expanded
 
 
def _build_grid_from_table_cell_annotations(
    annotations: List[Dict[str, Any]], duplicate_content_flag: bool = False
) -> List[List[List[str]]]:
    """Build a grid in which each location holds a list of content uids.

    Args:
        annotations: table structure annotations of a single table.
        duplicate_content_flag: forwarded to the duplication of merged cells.

    Returns:
        A list of rows, each a list of per-cell content uid lists. Locations
        not covered by any annotation hold an empty list.

    Raises:
        ValueError: if an annotation is not a table structure annotation.
    """
    for annotation in annotations:
        if annotation[TYPE] != TABLE_STRUCTURE:
            raise ValueError("Table grid can only be built from table structure annotations.")

    unit_annotations = _duplicate_annotations(annotations, duplicate_content_flag)
    uids_at = {entry[DATA][INDEX]: entry[CONTENT_UIDS] for entry in unit_annotations}
    n_rows, n_cols = _get_table_shape(unit_annotations)

    grid: List[List[List[str]]] = [
        [uids_at.get((row, col), []) for col in range(n_cols)] for row in range(n_rows)
    ]
    return grid
 
 
def _convert_uid_grid_to_content_grid(
    uid_grid: List[List[List[str]]], cell_contents: List[Dict[str, Any]]
) -> List[List[str]]:
    """Convert a UID grid to content grid."""
    uids_to_content = {cell[UID]: cell[CONTENT] for cell in cell_contents}
    content_grid = []
    for uid_row in uid_grid:
        content_row = []
        for content_uids in uid_row:
            if len(content_uids) > 0:
                first_content_uid = content_uids[0]
                text = uids_to_content[first_content_uid]
            else:
                text = ""
            content_row.append(text)
        content_grid.append(content_row)
    return content_grid
 
 
def build_table_grids(
    serialized_document: Dict[str, Any], duplicate_merged_cells_content_flag: bool = False
) -> Dict[str, List[List[str]]]:
    """Convert the tables of a serialized document into grids of strings.

    Args:
        serialized_document: a serialized document, holding the annotations and
            the content tree.
        duplicate_merged_cells_content_flag: if True, the content of a merged
            cell is repeated in every cell of the merged area. If False, only
            the top left cell of the area is filled and the others are empty.

    Returns:
        A mapping from table uid to its grid, a list of rows where each row is
        a list of cell values.
    """
    all_annotations = serialized_document[ANNOTATIONS]
    content_tree = serialized_document[CONTENT_TREE]

    cells_by_table = _get_table_uid_to_cells_mapping(content_tree)
    structure_annotations = [
        candidate for candidate in all_annotations if candidate[TYPE] == TABLE_STRUCTURE
    ]
    annotations_by_table = _get_table_uid_to_annotations_mapping(
        cells_by_table, structure_annotations
    )

    # For each table: lay the content uids out on a grid, then swap them for
    # the cell content.
    return {
        table_uid: _convert_uid_grid_to_content_grid(
            _build_grid_from_table_cell_annotations(
                table_annotations, duplicate_content_flag=duplicate_merged_cells_content_flag
            ),
            cells_by_table[table_uid],
        )
        for table_uid, table_annotations in annotations_by_table.items()
    }

build_table_grids returns a mapping (dictionary) from the table identifiers to corresponding table grids. Below is an example of using this function to extract table grids from an API response.

# `response` is the API response obtained beforehand; its "output" value is the serialized document.
serialized_document = response["output"]
# Maps each table uid to its grid (a list of rows, each row a list of cell values).
table_grids = build_table_grids(serialized_document)

Converting to CSV

To convert a grid into a spreadsheet table, we recommend using pandas (a Python data analysis library). Below is an example of converting a grid into a pandas DataFrame object and saving the result as a CSV file.

import pandas as pd

table_id = "5"  # The desired key from table_grids.keys()
table_grid = table_grids[table_id]
table_df = pd.DataFrame(table_grid)
# The grid holds only the table's cell values, so skip the integer column
# labels and row index that pandas generates for it; by default to_csv would
# write them as an extra first row and an extra first column.
table_df.to_csv("sample_table.csv", index=False, header=False)