Extraction
This guide covers the Extraction API, which lets you define named metadata fields and have the AI extract their values from your indexed documents. Files must be indexed as EliseFiles first — see File Indexation.
How It Works
Extraction runs as an asynchronous job, following the same pattern as Question Answering. You define metadata fields to extract, target a set of documents, and retrieve the structured results once the job completes.
The key difference from QA: instead of natural-language questions, you define metadata fields (metas) with a name, a description, and an expected data type. The platform extracts a typed value for each field from each document.
Creating an Extraction Job
- cURL
- Python (httpx)
- R
- SDK
# Create an extraction job: POST the target files/collections plus the
# metadata fields (metas) to extract. The response is the created job
# (jobId, status).
curl -X POST "https://<api-domain>/api/core/extraction/jobs" \
  -H "Authorization: Bearer <your-pat>" \
  -H "Content-Type: application/json" \
  -d '{
    "files": {
      "fileIds": ["a1b2c3d4-e5f6-7890-abcd-ef1234567890"],
      "collectionIds": []
    },
    "metas": [
      {
        "meta": "document_title",
        "description": "The full title of the document",
        "answerType": { "dataType": "STRING", "multiValued": false }
      },
      {
        "meta": "study_year",
        "description": "The year the study was conducted or published",
        "answerType": { "dataType": "INT", "multiValued": false }
      },
      {
        "meta": "risk_level",
        "description": "The assessed risk level",
        "answerType": {
          "dataType": "ENUM",
          "multiValued": false,
          "enumValues": ["LOW", "MEDIUM", "HIGH"]
        }
      }
    ]
  }'
from pydantic import BaseModel, Field
class AnswerType(BaseModel):
    """Expected type of an extracted value — mirrors the API's `answerType` object."""

    # camelCase wire names are mapped to snake_case attributes via aliases.
    data_type: str = Field(alias="dataType")  # e.g. "STRING", "INT", "ENUM"
    multi_valued: bool = Field(default=False, alias="multiValued")  # True -> a list of values is extracted
    enum_values: list[str] | None = Field(default=None, alias="enumValues")  # allowed values; set for ENUM fields
class MetaInput(BaseModel):
    """One metadata field to extract (an EliseMetaInput in the API)."""

    meta: str  # machine-readable field name, e.g. "document_title"
    answer_type: AnswerType = Field(alias="answerType")  # expected data type of the extracted value
    description: str | None = None  # optional human-readable hint that guides the extraction
class FilesInput(BaseModel):
    """Document targets for a job: individual file IDs and/or whole collections."""

    file_ids: list[str] = Field(default_factory=list, alias="fileIds")
    collection_ids: list[str] = Field(default_factory=list, alias="collectionIds")
class Job(BaseModel):
    """Extraction job as returned by the create, status, and listing endpoints."""

    job_id: str = Field(alias="jobId")
    status: str  # terminal statuses seen in this guide: SUCCESS, FAILED, ABORTED
    error_message: str | None = Field(default=None, alias="errorMessage")  # populated when the job fails
# Submit an extraction job: one file, three typed metadata fields.
payload = {
    "files": {
        "fileIds": ["a1b2c3d4-e5f6-7890-abcd-ef1234567890"],
        "collectionIds": [],
    },
    "metas": [
        {
            "meta": "document_title",
            "description": "The full title of the document",
            "answerType": {"dataType": "STRING", "multiValued": False},
        },
        {
            "meta": "study_year",
            "description": "The year the study was conducted or published",
            "answerType": {"dataType": "INT", "multiValued": False},
        },
        {
            "meta": "risk_level",
            "description": "The assessed risk level",
            "answerType": {
                "dataType": "ENUM",
                "multiValued": False,
                "enumValues": ["LOW", "MEDIUM", "HIGH"],
            },
        },
    ],
}
response = client.post("/extraction/jobs", json=payload)
response.raise_for_status()
job = Job.model_validate(response.json())
print(f"Job created: {job.job_id} (status: {job.status})")
# Create the extraction job: POST to /extraction/jobs with the target
# files and the metadata field definitions (metas).
resp <- base_req |>
  req_url_path_append("extraction", "jobs") |>
  req_body_json(list(
    files = list(
      fileIds = list("a1b2c3d4-e5f6-7890-abcd-ef1234567890"),
      collectionIds = list()
    ),
    metas = list(
      # Free-text field: the document's title.
      list(
        meta = "document_title",
        description = "The full title of the document",
        answerType = list(dataType = "STRING", multiValued = FALSE)
      ),
      # Integer field: the study year.
      list(
        meta = "study_year",
        description = "The year the study was conducted or published",
        answerType = list(dataType = "INT", multiValued = FALSE)
      ),
      # Enum field: the value is constrained to enumValues.
      list(
        meta = "risk_level",
        description = "The assessed risk level",
        answerType = list(
          dataType = "ENUM",
          multiValued = FALSE,
          enumValues = list("LOW", "MEDIUM", "HIGH")
        )
      )
    )
  )) |>
  req_perform()
# The response body is the created job (jobId, status, ...).
job <- resp_body_json(resp)
cat(sprintf("Job created: %s (status: %s)\n", job$jobId, job$status))
import asyncio
from biolevate import BiolevateClient, MetaInput


async def main() -> None:
    """Create an extraction job for one file via the SDK."""
    async with BiolevateClient(
        base_url="https://<api-domain>",
        token="<your-pat>",
    ) as client:
        # Each field pairs a machine-readable name with an expected type
        # and a description that guides the extraction.
        field_definitions = [
            MetaInput(
                meta="document_title",
                answer_type={"dataType": "STRING", "multiValued": False},
                description="The full title of the document",
            ),
            MetaInput(
                meta="study_year",
                answer_type={"dataType": "INT", "multiValued": False},
                description="The year the study was conducted or published",
            ),
            MetaInput(
                meta="risk_level",
                answer_type={
                    "dataType": "ENUM",
                    "multiValued": False,
                    "enumValues": ["LOW", "MEDIUM", "HIGH"],
                },
                description="The assessed risk level",
            ),
        ]
        job = await client.extraction.create_job(
            metas=field_definitions,
            file_ids=["a1b2c3d4-e5f6-7890-abcd-ef1234567890"],
        )
        print(f"Job created: {job.job_id} (status: {job.status})")


asyncio.run(main())
Defining Metadata Fields
Each field in the metas array is an EliseMetaInput:
| Field | Required | Description |
|---|---|---|
| meta | Yes | Machine-readable field name (e.g. "document_title") |
| answerType | Yes | Expected data type — see Answer Types |
| description | No | Human-readable description to guide the AI extraction |
Answer Types
The answerType.dataType field specifies the extracted value format:
| dataType | Description | Populated field in DataValue |
|---|---|---|
| STRING | Free text | strValue |
| INT | Integer number | longValue |
| FLOAT | Decimal number | doubleValue |
| BOOL | Boolean value | boolValue |
| DATE | Date or datetime | dateValue |
| ENUM | One of a fixed set of values | strValue |
Set multiValued: true to extract a list of values (e.g. multiple authors). The list variant of the corresponding field is then populated (strListValue, longListValue, etc.).
Polling Job Status
The job runs asynchronously. Poll until the status reaches a terminal value — SUCCESS, FAILED, or ABORTED.
- cURL
- Python (httpx)
- R
- SDK
# Fetch the job's current status; poll this until the status is terminal.
curl -s "https://<api-domain>/api/core/extraction/jobs/${JOB_ID}" \
  -H "Authorization: Bearer <your-pat>"
import time
def wait_for_job(
    job_id: str,
    poll_interval: float = 3.0,
    timeout: float | None = None,
) -> Job:
    """Poll an extraction job until it reaches a terminal status.

    Args:
        job_id: Identifier returned when the job was created.
        poll_interval: Seconds to sleep between status requests.
        timeout: Optional overall deadline in seconds. ``None`` (the
            default) polls forever, preserving the original behavior.

    Returns:
        The final Job, with status SUCCESS, FAILED, or ABORTED.

    Raises:
        TimeoutError: If ``timeout`` elapses before the job finishes.
    """
    # monotonic() is immune to wall-clock adjustments, so the deadline is stable.
    deadline = None if timeout is None else time.monotonic() + timeout
    while True:
        response = client.get(f"/extraction/jobs/{job_id}")
        response.raise_for_status()
        job = Job.model_validate(response.json())
        print(f"Status: {job.status}")
        if job.status in ("SUCCESS", "FAILED", "ABORTED"):
            return job
        if deadline is not None and time.monotonic() >= deadline:
            raise TimeoutError(f"Job {job_id} did not finish within {timeout}s")
        time.sleep(poll_interval)


completed_job = wait_for_job(job.job_id)
if completed_job.status != "SUCCESS":
    print(f"Job failed: {completed_job.error_message}")
# Poll the job until it reaches a terminal status
# (SUCCESS, FAILED, or ABORTED), returning the final job object.
wait_for_job <- function(job_id, poll_interval = 3) {
  repeat {
    resp <- base_req |>
      req_url_path_append("extraction", "jobs", job_id) |>
      req_perform()
    job <- resp_body_json(resp)
    cat(sprintf("Status: %s\n", job$status))
    if (job$status %in% c("SUCCESS", "FAILED", "ABORTED")) {
      return(job)
    }
    # Wait before the next poll (seconds).
    Sys.sleep(poll_interval)
  }
}
completed_job <- wait_for_job(job$jobId)
import asyncio
from biolevate import BiolevateClient

TERMINAL_STATUSES = {"SUCCESS", "FAILED", "ABORTED"}


async def main() -> None:
    """Poll an extraction job until it reaches a terminal status."""
    async with BiolevateClient(
        base_url="https://<api-domain>",
        token="<your-pat>",
    ) as client:
        job_id = "<your-job-id>"
        job = await client.extraction.get_job(job_id)
        print(f"Status: {job.status}")
        # Sleep, then re-fetch, until the job stops running.
        while job.status not in TERMINAL_STATUSES:
            await asyncio.sleep(3)
            job = await client.extraction.get_job(job_id)
            print(f"Status: {job.status}")


asyncio.run(main())
Retrieving Results
Once the job is SUCCESS, retrieve the extracted values.
- cURL
- Python (httpx)
- R
- SDK
# Fetch the extracted values once the job status is SUCCESS.
curl -s "https://<api-domain>/api/core/extraction/jobs/${JOB_ID}/results" \
  -H "Authorization: Bearer <your-pat>"
from pydantic import BaseModel, Field
from typing import Any
class DataValue(BaseModel):
    """Typed extraction result; only one of these fields is populated per value."""

    str_value: str | None = Field(default=None, alias="strValue")
    bool_value: bool | None = Field(default=None, alias="boolValue")
    long_value: int | None = Field(default=None, alias="longValue")
    double_value: float | None = Field(default=None, alias="doubleValue")
    date_value: str | None = Field(default=None, alias="dateValue")
    str_list_value: list[str] | None = Field(default=None, alias="strListValue")
    long_list_value: list[int] | None = Field(default=None, alias="longListValue")
    double_list_value: list[float] | None = Field(default=None, alias="doubleListValue")
    date_list_value: list[str] | None = Field(default=None, alias="dateListValue")

    def resolved(self) -> Any:
        """Return whichever field is populated, or None if all are unset."""
        candidates = (
            self.str_value,
            self.bool_value,
            self.long_value,
            self.double_value,
            self.date_value,
            self.str_list_value,
            self.long_list_value,
            self.double_list_value,
            self.date_list_value,
        )
        return next((v for v in candidates if v is not None), None)
class AnnotationId(BaseModel):
    """Reference to a source-passage annotation; resolve it via the annotations endpoint."""

    id: str
    entity_type: str = Field(alias="entityType")
class MetaResult(BaseModel):
    """Extraction result for a single field (an EliseMetaResult in the API)."""

    meta: str | None = None  # field name as defined in the job input
    answer: DataValue | None = None  # typed value; only one DataValue field is populated
    raw_value: object | None = Field(default=None, alias="rawValue")  # untyped value from the AI engine
    explanation: str | None = None  # why the AI extracted this value
    reference_ids: list[AnnotationId] = Field(default_factory=list, alias="referenceIds")  # source passages
class ExtractJobOutputs(BaseModel):
    """Container for all per-field results of an extraction job."""

    results: list[MetaResult] = Field(default_factory=list)
# Fetch the results and print each field's resolved value.
response = client.get(f"/extraction/jobs/{job.job_id}/results")
response.raise_for_status()
outputs = ExtractJobOutputs.model_validate(response.json())
for result in outputs.results:
    if result.answer is None:
        value = None
    else:
        value = result.answer.resolved()
    print(f"{result.meta}: {value}")
    if result.explanation:
        print(f" Explanation: {result.explanation}")
# Fetch the extracted values for the completed job.
resp <- base_req |>
  req_url_path_append("extraction", "jobs", job$jobId, "results") |>
  req_perform()
outputs <- resp_body_json(resp)
# Only one field of a DataValue is populated; return the first non-NULL one.
resolve_value <- function(answer) {
  for (field in c("strValue", "boolValue", "longValue", "doubleValue",
                  "dateValue", "strListValue", "longListValue",
                  "doubleListValue", "dateListValue")) {
    if (!is.null(answer[[field]])) return(answer[[field]])
  }
  return(NA)
}
for (result in outputs$results) {
  value <- if (!is.null(result$answer)) resolve_value(result$answer) else NA
  cat(sprintf("%s: %s\n", result$meta, as.character(value)))
  if (!is.null(result$explanation)) {
    cat(sprintf(" Explanation: %s\n", result$explanation))
  }
}
import asyncio
from biolevate import BiolevateClient

JOB_ID = "<your-job-id>"


async def main() -> None:
    """Print each extracted field's raw value and, when present, its explanation."""
    async with BiolevateClient(
        base_url="https://<api-domain>",
        token="<your-pat>",
    ) as client:
        outputs = await client.extraction.get_job_outputs(JOB_ID)
        for result in outputs.results:
            print(f"{result.meta}: {result.raw_value}")
            if not result.explanation:
                continue
            print(f" Explanation: {result.explanation}")


asyncio.run(main())
Understanding the Result Fields
Each EliseMetaResult contains:
| Field | Description |
|---|---|
| meta | The field name as defined in the job input |
| answer | Typed value — see DataValue below |
| rawValue | The raw extracted value as returned by the AI engine (untyped) |
| explanation | Why the AI extracted this value |
| referenceIds | Annotation IDs pointing to source passages |
The answer field is a DataValue object. Only one field within it is populated, depending on the answerType.dataType defined when the job was created:
{
"meta": "study_year",
"answer": { "longValue": 2023 },
"rawValue": 2023,
"explanation": "The document states 'published in 2023' on page 1."
}
When multiValued: true is set, the list variant of the corresponding field is populated instead (strListValue, longListValue, etc.).
Retrieving Inputs
Retrieve the original field definitions and file targets submitted to the job.
- cURL
- Python (httpx)
- R
- SDK
# Fetch the original inputs (metas and file targets) submitted with the job.
curl -s "https://<api-domain>/api/core/extraction/jobs/${JOB_ID}/inputs" \
  -H "Authorization: Bearer <your-pat>"
# Retrieve the original job inputs and list the requested field names.
response = client.get(f"/extraction/jobs/{job.job_id}/inputs")
response.raise_for_status()
inputs = response.json()
field_names = [m["meta"] for m in inputs["metas"]]
print(f"Fields: {field_names}")
# Retrieve the original inputs (field definitions and file targets).
resp <- base_req |>
  req_url_path_append("extraction", "jobs", job$jobId, "inputs") |>
  req_perform()
inputs <- resp_body_json(resp)
# Print a comma-separated list of the requested field names.
cat(sprintf("Fields: %s\n", paste(sapply(inputs$metas, `[[`, "meta"), collapse = ", ")))
import asyncio
from biolevate import BiolevateClient


async def main() -> None:
    """Fetch the field definitions originally submitted with the job."""
    async with BiolevateClient(
        base_url="https://<api-domain>",
        token="<your-pat>",
    ) as client:
        inputs = await client.extraction.get_job_inputs("<your-job-id>")
        names = [m.meta for m in inputs.metas]
        print(f"Fields: {names}")


asyncio.run(main())
Retrieving Annotations
Each extraction result includes a referenceIds field identifying the exact document passages the AI used to extract each value. Use the annotations endpoint to resolve those IDs into full objects with text excerpts, document names, and precise positions.
- cURL
- SDK
# Fetch the annotations (source passages) referenced by the job's results.
curl -s "https://<api-domain>/api/core/extraction/jobs/${JOB_ID}/annotations" \
  -H "Authorization: Bearer <your-pat>"
import asyncio
from biolevate import BiolevateClient


async def main() -> None:
    """Resolve a job's reference annotations and print a short excerpt of each."""
    async with BiolevateClient(
        base_url="https://<api-domain>",
        token="<your-pat>",
    ) as client:
        annotations = await client.extraction.get_job_annotations("<your-job-id>")
        for ann in annotations:
            if not ann.data:
                continue
            # Show the document name plus the first 80 characters of the passage.
            print(f"[{ann.data.document_name}] {ann.data.content[:80]}")


asyncio.run(main())
For the full annotation data model, position types, and lookup patterns, see the Annotations guide.
Listing Extraction Jobs
List all extraction jobs for the current user.
- cURL
- Python (httpx)
- R
- SDK
# List extraction jobs, paginated via the page and pageSize query parameters.
curl -s "https://<api-domain>/api/core/extraction/jobs?page=0&pageSize=20" \
  -H "Authorization: Bearer <your-pat>"
class JobPage(BaseModel):
    """One page of the extraction-job listing."""

    data: list[Job]  # jobs on this page
    total_pages: int = Field(alias="totalPages")
    total_elements: int = Field(alias="totalElements")
    has_next: bool = Field(alias="hasNext")  # True when further pages exist
# Request the first page of jobs (20 per page) and print each job's status.
params = {"page": 0, "pageSize": 20}
response = client.get("/extraction/jobs", params=params)
response.raise_for_status()
page = JobPage.model_validate(response.json())
for j in page.data:
    print(f"{j.job_id}: {j.status}")
# List extraction jobs for the current user, one page at a time.
resp <- base_req |>
  req_url_path_append("extraction", "jobs") |>
  req_url_query(page = 0, pageSize = 20) |>
  req_perform()
page <- resp_body_json(resp)
for (j in page$data) {
  cat(sprintf("%s: %s\n", j$jobId, j$status))
}
import asyncio
from biolevate import BiolevateClient


async def main() -> None:
    """List the current user's extraction jobs, one page at a time."""
    async with BiolevateClient(
        base_url="https://<api-domain>",
        token="<your-pat>",
    ) as client:
        first_page = await client.extraction.list_jobs(page=0, page_size=20)
        for job in first_page.data:
            print(f"{job.job_id}: {job.status}")


asyncio.run(main())
Next Steps
- Annotations — understand the full annotation data model and position types
- Question Answering for natural-language questions instead of structured fields
- Collections to organise files before running jobs on them
- API Reference for complete endpoint documentation