Extract Document Data with JavaScript and Node.js
Whether you're building a backend service with Node.js or a serverless function on Vercel, extracting structured data from documents is a common need. This guide shows how to integrate document extraction into your JavaScript project — from a single file to batch processing pipelines.
Setup
Works with Node.js 18+ (native fetch) or any environment with fetch available.
// All endpoints below are relative to this base URL.
const API_BASE = "https://api.smole.tech/api";
const API_KEY = "your_api_key_here"; // from https://smole.tech/account/api-keys
// Shared auth header — spread into each request's headers below.
const headers = {
Authorization: `Bearer ${API_KEY}`,
};
No external dependencies needed — just the built-in fetch and fs modules.
Step 1: Create a Schema
/**
 * Create an extraction schema on the API.
 *
 * @param {string} name - Human-readable schema name.
 * @param {object} schema - JSON Schema describing the fields to extract.
 * @returns {Promise<object>} The created schema record (includes `id`).
 * @throws {Error} On a non-2xx response; the message includes the response
 *   body so the failure reason (validation error, duplicate name, …) is visible.
 */
async function createSchema(name, schema) {
  const resp = await fetch(`${API_BASE}/schemas`, {
    method: "POST",
    headers: { ...headers, "Content-Type": "application/json" },
    body: JSON.stringify({ name, jsonSchema: schema }),
  });
  if (!resp.ok) {
    // Fix: a bare status code is hard to act on — surface the server's
    // error body as well (best-effort; ignore body read failures).
    const detail = await resp.text().catch(() => "");
    throw new Error(`Schema creation failed: ${resp.status} ${detail}`.trim());
  }
  return resp.json();
}
// Example: invoice schema
const schema = await createSchema("invoice", {
type: "object",
properties: {
vendor_name: { type: "string" },
invoice_number: { type: "string" },
date: { type: "string", format: "date" },
line_items: {
type: "array",
items: {
type: "object",
properties: {
description: { type: "string" },
quantity: { type: "number" },
unit_price: { type: "number" },
total: { type: "number" },
},
},
},
subtotal: { type: "number" },
tax_amount: { type: "number" },
total: { type: "number" },
},
});
const schemaId = schema.id;
Step 2: Upload a Document
import fs from "node:fs";
/**
 * Upload a document and start an extraction pipeline for it.
 *
 * @param {string} filePath - Path to the document on disk.
 * @param {string} schemaId - ID of the schema to extract against.
 * @returns {Promise<object>} The created pipeline record (includes `id`).
 * @throws {Error} If the upload responds with a non-2xx status.
 */
async function uploadDocument(filePath, schemaId) {
  const formData = new FormData();
  // Fix: pass a filename as the third argument — without it the multipart
  // part is named "blob" and the server loses the original name/extension.
  const fileName = filePath.split(/[\\/]/).pop();
  formData.append("file", new Blob([fs.readFileSync(filePath)]), fileName);
  formData.append("schemaId", schemaId);
  const resp = await fetch(`${API_BASE}/pipeline/file`, {
    method: "POST",
    headers,
    body: formData,
  });
  if (!resp.ok) throw new Error(`Upload failed: ${resp.status}`);
  return resp.json();
}
Step 3: Poll for Results
/** Resolve after `ms` milliseconds — lets polling loops pause with `await`. */
function sleep(ms) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}
/**
 * Poll the pipeline endpoint until the run completes, fails, or times out.
 *
 * @param {string} pipelineId - ID returned by the upload call.
 * @param {number} [timeoutMs=120_000] - Maximum time to keep polling.
 * @returns {Promise<object>} The full pipeline record once completed.
 * @throws {Error} On a non-2xx poll response, a failed pipeline, or timeout.
 */
async function waitForResult(pipelineId, timeoutMs = 120_000) {
  const deadline = Date.now() + timeoutMs;
  while (Date.now() < deadline) {
    const resp = await fetch(`${API_BASE}/pipeline/${pipelineId}`, { headers });
    if (!resp.ok) throw new Error(`Poll failed: ${resp.status}`);
    const data = await resp.json();
    switch (data.status) {
      case "completed":
        return data;
      case "failed":
        throw new Error(`Pipeline failed: ${data.error}`);
      default:
        // Still processing — back off briefly before the next poll.
        await sleep(2000);
    }
  }
  throw new Error(`Timed out after ${timeoutMs}ms`);
}
Complete Extraction Function
Putting it all together:
import fs from "node:fs";
/**
 * Upload a document and wait for its extraction result.
 *
 * @param {string} filePath - Path to the document on disk.
 * @param {string} schemaId - ID of the schema to extract against.
 * @param {number} [timeoutMs=120_000] - Maximum time to wait for completion
 *   (generalized from the previously hard-coded 120s; default is unchanged).
 * @returns {Promise<object>} The extracted data matching the schema.
 * @throws {Error} On upload failure, poll failure, pipeline failure, or timeout.
 */
async function extract(filePath, schemaId, timeoutMs = 120_000) {
  // Upload — include the file name so the server keeps the extension.
  const formData = new FormData();
  const fileName = filePath.split(/[\\/]/).pop();
  formData.append("file", new Blob([fs.readFileSync(filePath)]), fileName);
  formData.append("schemaId", schemaId);
  const uploadResp = await fetch(`${API_BASE}/pipeline/file`, {
    method: "POST",
    headers,
    body: formData,
  });
  if (!uploadResp.ok) throw new Error(`Upload failed: ${uploadResp.status}`);
  const { id: pipelineId } = await uploadResp.json();

  // Poll until the pipeline settles or the timeout elapses.
  const deadline = Date.now() + timeoutMs;
  while (Date.now() < deadline) {
    const resp = await fetch(`${API_BASE}/pipeline/${pipelineId}`, { headers });
    // Fix: the original ignored HTTP errors here, so a 5xx poll response
    // surfaced as a confusing JSON parse failure or a silent retry loop.
    if (!resp.ok) throw new Error(`Poll failed: ${resp.status}`);
    const data = await resp.json();
    if (data.status === "completed") return data.extraction.data;
    if (data.status === "failed") throw new Error(`Failed: ${data.error}`);
    await sleep(2000);
  }
  throw new Error(`Timed out after ${timeoutMs}ms`);
}
// Usage
// The resolved value is the extracted object shaped by the schema above.
const result = await extract("invoice.pdf", schemaId);
console.log(result.vendor_name); // "Acme Corp"
console.log(result.total); // 1250.00
console.log(result.line_items); // [{ description: "...", ... }]
Batch Processing
Process multiple files concurrently with controlled parallelism:
import fs from "node:fs";
import path from "node:path";
/**
 * Extract every PDF in a directory with bounded concurrency.
 *
 * @param {string} directory - Folder to scan for .pdf files.
 * @param {string} schemaId - Schema to extract against.
 * @param {number} [concurrency=5] - Maximum simultaneous extractions.
 * @returns {Promise<Array<{file: string, data: object}>>} Successful results
 *   only; failed files are logged to stderr and skipped.
 */
async function processBatch(directory, schemaId, concurrency = 5) {
  const files = fs.readdirSync(directory).filter((f) => f.endsWith(".pdf"));
  const results = [];
  // Process in chunks so at most `concurrency` extractions run at once.
  for (let i = 0; i < files.length; i += concurrency) {
    const chunk = files.slice(i, i + concurrency);
    const chunkResults = await Promise.allSettled(
      chunk.map(async (file) => {
        const filePath = path.join(directory, file);
        try {
          const data = await extract(filePath, schemaId);
          return { file, data };
        } catch (err) {
          // Fix: tag the rejection with the file name — Promise.allSettled
          // reasons otherwise give no hint which document failed.
          throw new Error(`${file}: ${err.message}`, { cause: err });
        }
      })
    );
    for (const result of chunkResults) {
      if (result.status === "fulfilled") {
        results.push(result.value);
        console.log(` OK: ${result.value.file}`);
      } else {
        console.error(` FAIL: ${result.reason.message}`);
      }
    }
  }
  return results;
}
// Run the whole folder, then persist the successful extractions to disk.
const results = await processBatch("./invoices", schemaId);
fs.writeFileSync("results.json", JSON.stringify(results, null, 2));
Express.js Integration
Add document extraction to an Express API:
import express from "express";
import multer from "multer";
const app = express();
// Keep uploads in memory: the buffer is forwarded straight to the API.
const upload = multer({ storage: multer.memoryStorage() });
// POST /extract — accepts a multipart upload in the "document" field plus a
// schemaId, runs the pipeline, and responds with the extracted JSON.
app.post("/extract", upload.single("document"), async (req, res) => {
  try {
    // Fix: validate the request up front — previously a missing file crashed
    // on `req.file.buffer` and surfaced as an opaque 500 instead of a 400.
    if (!req.file) {
      return res.status(400).json({ error: "No document uploaded" });
    }
    if (!req.body.schemaId) {
      return res.status(400).json({ error: "schemaId is required" });
    }
    const formData = new FormData();
    formData.append("file", new Blob([req.file.buffer]), req.file.originalname);
    formData.append("schemaId", req.body.schemaId);
    const uploadResp = await fetch(`${API_BASE}/pipeline/file`, {
      method: "POST",
      headers,
      body: formData,
    });
    // Fix: the original never checked the upload response, so a failed
    // upload destructured `id` as undefined and polled a bogus pipeline.
    if (!uploadResp.ok) throw new Error(`Upload failed: ${uploadResp.status}`);
    const { id: pipelineId } = await uploadResp.json();
    const result = await waitForResult(pipelineId);
    res.json(result.extraction.data);
  } catch (err) {
    res.status(500).json({ error: err.message });
  }
});
Error Handling with Retries
async function extractWithRetries(filePath, schemaId, maxRetries = 3) {
let lastError;
for (let attempt = 1; attempt <= maxRetries; attempt++) {
try {
return await extract(filePath, schemaId);
} catch (err) {
lastError = err;
if (attempt < maxRetries) {
const delay = 2 ** attempt * 1000;
console.log(` Retry ${attempt}/${maxRetries} in ${delay}ms...`);
await sleep(delay);
}
}
}
throw lastError;
}
TypeScript Types
Add type safety to your extraction results:
/** Shape of the data extracted with the "invoice" schema defined earlier. */
interface InvoiceData {
vendor_name: string;
invoice_number: string;
/** Date string — the schema requests JSON Schema format "date". */
date: string;
line_items: {
description: string;
quantity: number;
unit_price: number;
total: number;
}[];
subtotal: number;
tax_amount: number;
total: number;
}
// The cast is unchecked at runtime — the API result is trusted to match the schema.
const result = await extract("invoice.pdf", schemaId) as InvoiceData;
// Full type safety from here
Try It Now
Test extraction in the Playground before writing code — upload a document and see the JSON output. Then use the code above to integrate into your JavaScript project.
For the full API reference, see the documentation.
Related articles
How to Convert PDFs to JSON with an API
A practical guide to converting PDF documents into structured JSON data using a REST API. Covers digital PDFs, scanned documents, and batch processing.
Python: Extract Structured Data from Documents with Python
How to extract structured JSON data from PDFs, scanned documents, and Word files using Python. Complete code examples with requests, error handling, and batch processing.
DOCX: Convert Word Documents (DOCX) to JSON via API
How to extract structured JSON data from Word documents using a REST API. Convert DOCX files to structured data for contracts, reports, and forms.
