Accessing Extract with a trial account
This guide is meant for users who would like to access Extract programmatically. If you do not yet have an account, sign up (opens in a new tab) for access.
To get started, visit the authentication quickstart to learn how to generate access tokens.
Once you have an access token, you can send documents for extraction. The recommended approach uses the upload URL flow, which supports larger documents and provides pre-signed URLs for both upload and download.
This requires calling the endpoints in order:
1. /v3/extractions/upload-url — create the request and get a pre-signed upload URL
2. Upload the document to the pre-signed URL
3. /v3/extractions/upload-complete — notify that the upload is finished so extraction can begin
4. /v3/extractions/download-url/{request_id} — get a pre-signed URL to download the extracted output
import time
import requests
# Upload-URL flow: create a request, upload the document to a pre-signed URL,
# mark the upload complete, then poll until the extracted output can be downloaded.
# NOTE: `access_token` and `filename` are not defined in this snippet; set them
# before running it.
base_url = "https://extract.kensho.com"
headers = {"Authorization": f"Bearer {access_token}"}

# Step 1: Create the extraction request and get the upload URL
response = requests.post(
    f"{base_url}/v3/extractions/upload-url",
    data={
        "output_format": "structured_document_with_locations",
        "document_type": "hierarchical",
        "ocr": "false",
        "enhanced_table_extraction": "true",
    },
    headers=headers,
)
response.raise_for_status()
result = response.json()
request_id = result["request_id"]
upload_url = result["upload_spec"]["url"]
upload_fields = result["upload_spec"]["fields"]

# Step 2: Upload the document to the pre-signed URL
# The form fields returned in Step 1 are sent alongside the file itself.
with open(filename, "rb") as f:
    upload_response = requests.post(
        upload_url,
        data=upload_fields,
        files={"file": f},
    )
upload_response.raise_for_status()

# Step 3: Mark the upload as complete to start extraction
complete_response = requests.put(
    f"{base_url}/v3/extractions/upload-complete",
    data={"request_id": request_id},
    headers=headers,
)
complete_response.raise_for_status()

# Step 4: Poll for completion, then download the result
print(f"Waiting for extraction {request_id}")
while True:
    download_response = requests.get(
        f"{base_url}/v3/extractions/download-url/{request_id}",
        headers=headers,
    )
    download_response.raise_for_status()
    download_result = download_response.json()
    if download_result["status"] == "success":
        break
    elif download_result["status"] == "failed":
        raise RuntimeError(f"Extraction failed: {download_result.get('error')}")
    # Any other status is treated as "still in progress": wait, then poll again.
    time.sleep(2)

# Download the extracted output from the pre-signed output URL
output_response = requests.get(download_result["output_url"])
output_response.raise_for_status()
print(output_response.json())
Using the Direct Upload Flow
Alternatively, you can submit a document directly via multipart form data to /v3/extractions, but this is not recommended since large documents could take a long time to process:
# Direct-upload flow: send the document as multipart form data, then poll the
# request URL until the job is no longer pending.
# NOTE: `access_token` and `filename` are not defined in this snippet; set them
# before running it.
request_url = "https://extract.kensho.com/v3/extractions"
headers = {"Authorization": f"Bearer {access_token}"}

print("Sending a document to extract")
# Open the document in a `with` block so the file handle is closed once the
# upload request has been sent.
with open(filename, "rb") as document:
    response = requests.post(
        request_url,
        files={"file": document},
        data={
            # "document_type" should be "hierarchical", "hierarchical_v2", or "general"
            "document_type": "hierarchical",
            # If the document is known to be scanned, set ocr to "true".
            # If the document is known to be a native pdf, set ocr to "false".
            "ocr": "false",
            # To turn off the enhanced table extraction model, set enhanced_table_extraction to "false".
            "enhanced_table_extraction": "true",
            # To enable figure extraction, set figure_extraction to "true". It is false by default and is
            # not required to be specified.
            # "figure_extraction": "false",
        },
        headers=headers,
    )
response.raise_for_status()
request_id = response.json()["request_id"]
response_url = f"{request_url}/{request_id}"

params = {}
# To include bounding boxes in the output, uncomment the following:
# params["output_format"] = "structured_document_with_locations"

print(f"Waiting for job {request_id}")
response = requests.get(response_url, headers=headers, params=params)
# Keep polling while the request succeeds and the job still reports "pending".
while response.status_code == 200 and response.json()['status'] == 'pending':
    time.sleep(2)
    response = requests.get(response_url, headers=headers, params=params)
if response.status_code == 200:
    print(response.json())
Using Extract Output with an LLM
The Kensho Extract API can be used to extract data from documents and then answer questions using a large language model (LLM). This is a powerful combination that allows you to leverage the strengths of both technologies.
Here is an example of how to use the Kensho Extract API to summarize a document with GPT-4o:
# Dependencies for the LLM example: standard library first, then third-party.
import os

import openai
from kensho_kenverters.convert_output import convert_output_to_markdown

# Convert the "output" field of the Extract API's JSON response into Markdown.
# NOTE(review): `response` is presumably the completed Extract response from
# the preceding example -- confirm before running this snippet on its own.
extract_output = response.json()["output"]
markdown_output = convert_output_to_markdown(extract_output)

## Set your OpenAI API key (read from the OPENAI_API_KEY environment variable)
openai.api_key = os.environ.get("OPENAI_API_KEY")
def get_gpt4o_response(query, context):
    """
    Sends a query to GPT-4o with context and returns the response.

    The context and the query are joined with a newline and sent as a single
    user message, after a fixed system prompt.

    Args:
        query (str): The user's question or request.
        context (str): Additional information to provide context for the model.

    Returns:
        str: The generated response from GPT-4o. If anything goes wrong while
        calling the API or reading its reply, no exception is raised; a string
        of the form "An error occurred: <details>" is returned instead.
    """
    try:
        response = openai.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": context + "\n" + query},
            ],
        )
        return response.choices[0].message.content
    except Exception as e:
        # Deliberately broad: this example reports failures as text rather
        # than propagating them to the caller.
        return f"An error occurred: {e}"
## Ask the model to summarize the document, passing the Extract Markdown as context
query_text = "Summarize the main points of the document."
context_text = markdown_output
gpt4o_response = get_gpt4o_response(query=query_text, context=context_text)
print(gpt4o_response)