V3 (latest)
Quickstart

Accessing Extract with a trial account

This guide is meant for users who would like to access Extract programmatically. If you do not yet have an account, sign up (opens in a new tab) for access.

To get started, visit the authentication quickstart to learn how to generate access tokens.

Once you have an access token, you can send documents for extraction.

# Quickstart: upload a document for extraction, then poll until the job
# is no longer pending and print the result.
#
# Expects `access_token` (a bearer token string) and `filename` (path of the
# document to upload) to be defined before this snippet runs, and the
# `requests` and `time` modules to be imported.
request_url = "https://extract.kensho.com/v3/extractions"
headers = {"Authorization": f"Bearer {access_token}"}

print("Sending a document to extract")
# Open the document in a context manager so the file handle is closed as
# soon as the upload finishes, even if the request raises.
with open(filename, "rb") as document:
    response = requests.post(
        request_url,
        files={"file": document},
        data={
            # "document_type" should be "hierarchical" or "general"
            "document_type": "hierarchical",
            # If the document is scanned, set ocr to "true". If it is a native PDF, set ocr to "false"
            "ocr": "true",
            # To enable the enhanced table extraction, set enhanced_table_extraction to "true"
            "enhanced_table_extraction": "false",
        },
        headers=headers,
    )
# Fail loudly if the upload was rejected instead of continuing with a bad response.
response.raise_for_status()

request_id = response.json()["request_id"]
response_url = f"{request_url}/{request_id}"

params = {}
# To include bounding boxes in the output, uncomment the following:
# params["output_format"] = "structured_document_with_locations"

# print() does not interpolate "%s" placeholders, so format the message directly.
print(f"Waiting for job {request_id}")

# Poll every 2 seconds until the job status is anything other than "pending".
while True:
    response = requests.get(response_url, headers=headers, params=params)
    # Surface HTTP errors the same way the upload does, rather than ending silently.
    response.raise_for_status()
    result = response.json()
    if result["status"] != "pending":
        break
    time.sleep(2)

print(result)