Accessing Extract with a trial account
This guide is meant for users who would like to access Extract programmatically. If you do not yet have an account, sign up (opens in a new tab) for access.
To get started, visit the authentication quickstart to learn how to generate access tokens.
Once you have an access token, you can send documents for extraction.
Python
# Example: upload a document for extraction, then poll until the job is no
# longer pending and print the result.
#
# This snippet assumes that `requests` and `time` are already imported and
# that `access_token` (a valid access token) and `filename` (path of the
# document to send) are defined.
request_url = "https://extract.kensho.com/v3/extractions"
headers = {"Authorization": f"Bearer {access_token}"}

print("Sending a document to extract")
# Open the document in a context manager so the file handle is closed as soon
# as the upload request has completed.
with open(filename, "rb") as document:
    response = requests.post(
        request_url,
        files={"file": document},
        data={
            # "document_type" should be "hierarchical", "hierarchical_v2", or "general"
            "document_type": "hierarchical",
            # If the document is known to be scanned, set ocr to "true".
            # If the document is known to be a native pdf, set ocr to "false".
            "ocr": "false",
            # To turn off the enhanced table extraction model, set
            # enhanced_table_extraction to "false".
            "enhanced_table_extraction": "true",
            # To enable figure extraction, set figure_extraction to "true". It is
            # false by default and is not required to be specified.
            # "figure_extraction": "false",
        },
        headers=headers,
    )
# Fail fast if the upload was rejected.
response.raise_for_status()
request_id = response.json()["request_id"]

# The status/result of a job is fetched from <request_url>/<request_id>.
response_url = f"{request_url}/{request_id}"
params = {}
# To include bounding boxes in the output, uncomment the following:
# params["output_format"] = "structured_document_with_locations"

print(f"Waiting for job {request_id}")
response = requests.get(response_url, headers=headers, params=params)
# Keep polling every 2 seconds while the job reports a "pending" status.
while response.status_code == 200 and response.json()["status"] == "pending":
    time.sleep(2)
    response = requests.get(response_url, headers=headers, params=params)

if response.status_code == 200:
    print(response.json())
else:
    # Surface HTTP errors instead of exiting silently, mirroring the check
    # performed on the initial POST.
    response.raise_for_status()