File Indexation
This guide covers creating EliseFiles from provider items, understanding the indexation lifecycle, and working with AI-extracted ontologies. Make sure you have files in a provider first — see Managing Provider Items.
The Indexation Flow
Creating an EliseFile is the step that transforms a raw file in storage into an intelligent, searchable document. The flow looks like this:
- You create an EliseFile by referencing a provider item (provider ID, path, name)
- The platform indexes the file asynchronously (parses content, extracts structure)
- Ontologies are automatically computed (AI-extracted concepts and metadata)
Creating an EliseFile
Create an EliseFile from a file that already exists in a provider. The request references the source provider and file location.
- cURL
- Python (httpx)
- R
- SDK
# Register a provider file as an EliseFile; indexation then runs asynchronously.
curl -X POST "https://<api-domain>/api/core/files" \
-H "Authorization: Bearer <your-pat>" \
-H "Content-Type: application/json" \
-d '{
"providerId": "550e8400-e29b-41d4-a716-446655440000",
"key": "reports/research-paper.pdf"
}'
from pydantic import BaseModel, Field


class FileId(BaseModel):
    """Composite identifier returned by the Files API."""

    id: str
    entity_type: str = Field(alias="entityType")


class IndexationInfo(BaseModel):
    """Status of the most recent indexation run for a file."""

    status: str | None = None
    error_message: str | None = Field(None, alias="errorMessage")


class EliseFileInfo(BaseModel):
    """Full EliseFile record as returned by the API (camelCase aliases)."""

    id: FileId
    name: str
    key: str
    extension: str | None = None
    media_type: str | None = Field(None, alias="mediaType")
    size: int | None = None
    indexed: bool
    match_computed: bool = Field(alias="matchComputed")
    last_indexation_infos: IndexationInfo | None = Field(
        None, alias="lastIndexationInfos"
    )
    title: str | None = None
    authors: str | None = None
    description: str | None = None
    display_name: str | None = Field(None, alias="displayName")
    provider_name: str | None = Field(None, alias="providerName")
    type: str | None = None


# Register the provider file as an EliseFile; indexation starts asynchronously.
payload = {
    "providerId": "550e8400-e29b-41d4-a716-446655440000",
    "key": "reports/research-paper.pdf",
}
response = client.post("/files", json=payload)
response.raise_for_status()

file_info = EliseFileInfo.model_validate(response.json())
print(f"Created EliseFile: {file_info.id.id}")
print(f" Key: {file_info.key}")
print(f" Indexed: {file_info.indexed}")
# Register the provider file as an EliseFile via POST /files.
req <- req_url_path_append(base_req, "files")
req <- req_body_json(req, list(
  providerId = "550e8400-e29b-41d4-a716-446655440000",
  key = "reports/research-paper.pdf"
))
resp <- req_perform(req)

file_info <- resp_body_json(resp)
cat(sprintf("Created EliseFile: %s\n", file_info$id$id))
cat(sprintf(" Key: %s\n", file_info$key))
cat(sprintf(" Indexed: %s\n", file_info$indexed))
import asyncio

from biolevate import BiolevateClient


async def main() -> None:
    """Create an EliseFile from an existing provider item via the SDK."""
    async with BiolevateClient(base_url="https://<api-domain>", token="<your-pat>") as client:
        created = await client.files.create(
            provider_id="550e8400-e29b-41d4-a716-446655440000",
            key="reports/research-paper.pdf",
        )
        print(f"Created EliseFile: {created.id.id}")
        print(f" Key: {created.key}")
        print(f" Indexed: {created.indexed}")


asyncio.run(main())
The file must have one of the supported extensions: pdf, txt, html, htm, doc, docx, csv, xlsx, xls, ods, jpg, jpeg, png, webp, bmp, tiff, json, xml. Unsupported extensions will return a 400 error.
Getting File Information
Retrieve the current state of an EliseFile, including its indexation status.
- cURL
- Python (httpx)
- R
- SDK
# Fetch the current EliseFile record, including lastIndexationInfos.
curl -s "https://<api-domain>/api/core/files/${FILE_ID}" \
-H "Authorization: Bearer <your-pat>"
# Retrieve the current state of the EliseFile created above.
file_id = file_info.id.id

response = client.get(f"/files/{file_id}")
response.raise_for_status()
info = EliseFileInfo.model_validate(response.json())

print(f"File: {info.name}")
print(f"Indexed: {info.indexed}")
indexation = info.last_indexation_infos
if indexation is not None:
    print(f"Indexation status: {indexation.status}")
# Retrieve the current state of the EliseFile created above.
file_id <- file_info$id$id
resp <- req_perform(req_url_path_append(base_req, "files", file_id))
info <- resp_body_json(resp)

cat(sprintf("File: %s\n", info$name))
cat(sprintf("Indexed: %s\n", info$indexed))
last <- info$lastIndexationInfos
if (!is.null(last)) {
  cat(sprintf("Indexation status: %s\n", last$status))
}
import asyncio

from biolevate import BiolevateClient


async def main() -> None:
    """Fetch an EliseFile and report its indexation status."""
    async with BiolevateClient(base_url="https://<api-domain>", token="<your-pat>") as client:
        info = await client.files.get("a1b2c3d4-e5f6-7890-abcd-ef1234567890")
        print(f"File: {info.name}")
        print(f"Indexed: {info.indexed}")
        last = info.last_indexation_infos
        if last is not None:
            print(f"Indexation status: {last.status}")


asyncio.run(main())
Indexation Status
The lastIndexationInfos.status field tracks the indexation progress:
| Status | Description |
|---|---|
| PENDING | Indexation has been queued but not started |
| RUNNING | Indexation is in progress |
| SUCCESS | Indexation completed successfully |
| FAILED | Indexation encountered an error |
| ABORTED | Indexation was cancelled |
When the status is FAILED, the errorMessage field provides details about what went wrong.
Retrieving Ontologies
Ontologies are AI-extracted metadata and concepts computed automatically during indexation. They represent structured knowledge extracted from the file content: topics, entities, categories, and other semantic information.
- cURL
- Python (httpx)
- R
- SDK
# List the AI-extracted ontologies computed for an indexed file.
curl -s "https://<api-domain>/api/core/files/${FILE_ID}/ontologies" \
-H "Authorization: Bearer <your-pat>"
from pydantic import BaseModel, Field


class OntologyMeta(BaseModel):
    """A single AI-extracted metadata entry attached to a concept."""

    explanation: str | None = None
    meta_name: str | None = Field(None, alias="metaName")
    meta_value: object | None = Field(None, alias="metaValue")


class ConceptId(BaseModel):
    """Composite identifier of an extracted concept."""

    id: str
    entity_type: str = Field(alias="entityType")


class EliseOntology(BaseModel):
    """A concept extracted from the file, with optional metadata entries."""

    concept_id: ConceptId = Field(alias="conceptId")
    name: str
    metas: dict[str, OntologyMeta] | None = None


response = client.get(f"/files/{file_id}/ontologies")
response.raise_for_status()

ontologies = [EliseOntology.model_validate(item) for item in response.json()]
for ontology in ontologies:
    print(f"Concept: {ontology.name}")
    for key, meta in (ontology.metas or {}).items():
        print(f" {key}: {meta.meta_value}")
        if meta.explanation:
            print(f" Explanation: {meta.explanation}")
# Fetch the AI-extracted ontologies computed during indexation.
resp <- req_perform(req_url_path_append(base_req, "files", file_id, "ontologies"))
ontologies <- resp_body_json(resp)

for (ontology in ontologies) {
  cat(sprintf("Concept: %s\n", ontology$name))
  metas <- ontology$metas
  if (!is.null(metas)) {
    for (key in names(metas)) {
      cat(sprintf(" %s: %s\n", key, as.character(metas[[key]]$metaValue)))
    }
  }
}
import asyncio

from biolevate import BiolevateClient


async def main() -> None:
    """List the AI-extracted ontologies of an indexed file via the SDK."""
    async with BiolevateClient(base_url="https://<api-domain>", token="<your-pat>") as client:
        ontologies = await client.files.get_ontologies("a1b2c3d4-e5f6-7890-abcd-ef1234567890")
        for ontology in ontologies:
            print(f"Concept: {ontology.name}")
            for key, meta in (ontology.metas or {}).items():
                print(f" {key}: {meta.meta_value}")


asyncio.run(main())
Reindexing a File
Force reindexation of an already indexed file. This is useful when you want to re-process the document, for example after platform updates that improve parsing quality.
- cURL
- Python (httpx)
- R
- SDK
# Queue a full reindexation; the endpoint returns 202 Accepted immediately.
curl -X POST "https://<api-domain>/api/core/files/${FILE_ID}/reindex" \
-H "Authorization: Bearer <your-pat>"
# Queue a full reindexation; the API answers with 202 Accepted immediately.
reindex_response = client.post(f"/files/{file_id}/reindex")
reindex_response.raise_for_status()
print("Reindexation started (202 Accepted)")
# Queue a full reindexation; the endpoint answers 202 Accepted immediately.
req <- req_url_path_append(base_req, "files", file_id, "reindex")
req_perform(req_method(req, "POST"))
cat("Reindexation started (202 Accepted)\n")
import asyncio

from biolevate import BiolevateClient


async def main() -> None:
    """Trigger asynchronous reindexation of an existing EliseFile."""
    async with BiolevateClient(base_url="https://<api-domain>", token="<your-pat>") as client:
        await client.files.reindex("a1b2c3d4-e5f6-7890-abcd-ef1234567890")
        print("Reindexation started")


asyncio.run(main())
Reindexation runs asynchronously. The endpoint returns 202 Accepted immediately. Poll the file info endpoint to monitor progress via lastIndexationInfos.status.
Recomputing Ontologies
Force recomputation of AI-extracted ontologies and metadata without reindexing the full file. This is useful when ontology models have been updated.
- cURL
- Python (httpx)
- R
- SDK
# Recompute AI-extracted ontologies only, without reindexing the whole file.
curl -X POST "https://<api-domain>/api/core/files/${FILE_ID}/recompute-ontologies" \
-H "Authorization: Bearer <your-pat>"
# Recompute ontologies only; returns 202 Accepted and runs asynchronously.
recompute_response = client.post(f"/files/{file_id}/recompute-ontologies")
recompute_response.raise_for_status()
print("Ontology recomputation started (202 Accepted)")
# Recompute ontologies only; the endpoint answers 202 Accepted immediately.
req <- req_url_path_append(base_req, "files", file_id, "recompute-ontologies")
req_perform(req_method(req, "POST"))
cat("Ontology recomputation started (202 Accepted)\n")
import asyncio

from biolevate import BiolevateClient


async def main() -> None:
    """Recompute AI-extracted ontologies without a full reindexation."""
    async with BiolevateClient(base_url="https://<api-domain>", token="<your-pat>") as client:
        await client.files.recompute_ontologies("a1b2c3d4-e5f6-7890-abcd-ef1234567890")
        print("Ontology recomputation started")


asyncio.run(main())
Deleting a File
Delete an EliseFile. This removes the indexed document from the Elise system. The original file in the provider is not affected.
- cURL
- Python (httpx)
- R
- SDK
# Delete the EliseFile from the Elise index; the provider file is untouched.
curl -X DELETE "https://<api-domain>/api/core/files/${FILE_ID}" \
-H "Authorization: Bearer <your-pat>"
# Remove the EliseFile from the index; the provider file is not affected.
delete_response = client.delete(f"/files/{file_id}")
delete_response.raise_for_status()
print("File deleted (204 No Content)")
# Remove the EliseFile from the index; the provider file is not affected.
req <- req_url_path_append(base_req, "files", file_id)
req_perform(req_method(req, "DELETE"))
cat("File deleted (204 No Content)\n")
import asyncio

from biolevate import BiolevateClient


async def main() -> None:
    """Delete an EliseFile; the original file in the provider remains."""
    async with BiolevateClient(base_url="https://<api-domain>", token="<your-pat>") as client:
        await client.files.delete("a1b2c3d4-e5f6-7890-abcd-ef1234567890")
        print("File deleted")


asyncio.run(main())
Deleting an EliseFile removes it from the Elise index only. The underlying file in the storage provider remains untouched. To also remove the file from storage, use the Delete Item endpoint on the provider.
Listing Indexed Files
List all EliseFiles registered for a given provider, with pagination and optional sorting. This returns only files that have already been created as EliseFiles — not raw provider items.
- cURL
- Python (httpx)
- R
- SDK
# List the EliseFiles registered for one provider, paginated (zero-based page).
curl -s "https://<api-domain>/api/core/files?providerId=${PROVIDER_ID}&page=0&pageSize=20" \
-H "Authorization: Bearer <your-pat>"
from pydantic import BaseModel, Field


class FilesPage(BaseModel):
    """One page of EliseFile results with pagination metadata."""

    data: list[EliseFileInfo]
    total_pages: int = Field(alias="totalPages")
    total_elements: int = Field(alias="totalElements")
    has_next: bool = Field(alias="hasNext")


# List EliseFiles for the provider, sorted by name ascending.
query = {
    "providerId": "550e8400-e29b-41d4-a716-446655440000",
    "page": 0,
    "pageSize": 20,
    "sortProperty": "name",
    "sortOrder": "ASC",
}
response = client.get("/files", params=query)
response.raise_for_status()

page = FilesPage.model_validate(response.json())
print(f"Total indexed files: {page.total_elements}")
for file_info in page.data:
    print(f" {file_info.name} (indexed: {file_info.indexed})")
# List EliseFiles for the provider, one page at a time, sorted by name.
req <- req_url_path_append(base_req, "files")
req <- req_url_query(
  req,
  providerId = "550e8400-e29b-41d4-a716-446655440000",
  page = 0,
  pageSize = 20,
  sortProperty = "name",
  sortOrder = "ASC"
)
resp <- req_perform(req)

page <- resp_body_json(resp)
cat(sprintf("Total indexed files: %d\n", page$totalElements))
for (file_info in page$data) {
  cat(sprintf(" %s (indexed: %s)\n", file_info$name, file_info$indexed))
}
import asyncio

from biolevate import BiolevateClient


async def main() -> None:
    """Page through the EliseFiles registered for one provider."""
    async with BiolevateClient(base_url="https://<api-domain>", token="<your-pat>") as client:
        page = await client.files.list(
            provider_id="550e8400-e29b-41d4-a716-446655440000",
            page=0,
            page_size=20,
        )
        print(f"Total indexed files: {page.total_elements}")
        for file_info in page.data:
            print(f" {file_info.name} (indexed: {file_info.indexed})")


asyncio.run(main())
Query Parameters
| Parameter | Required | Description |
|---|---|---|
| providerId | Yes | UUID of the provider to list files for |
| page | Yes | Zero-based page number |
| pageSize | Yes | Number of results per page |
| sortProperty | No | Field to sort by (e.g. name, path) |
| sortOrder | No | ASC or DESC |
This endpoint lists files that have been created as EliseFiles via POST /files. Raw files that exist in the provider but have not been indexed do not appear here. To browse raw provider content, use the Provider Items endpoints.
Next Steps
Now that you can index files and extract metadata:
- Organize into collections to group your indexed documents
- Common Patterns for pagination and error handling
See the Files endpoints in the API Reference for complete request/response schemas.