Extract Document Data with JavaScript and Node.js
Whether you're building a backend service with Node.js or a serverless function on Vercel, extracting structured data from documents is a common need. This guide shows how to integrate document extraction into your JavaScript project — from a single file to batch processing pipelines.
Setup
Works with Node.js 18+ (native fetch) or any environment with fetch available.
// All endpoints below are relative to this base URL.
const API_BASE = "https://api.smole.tech/api";
const API_KEY = "your_api_key_here"; // from https://smole.tech/account/api-keys
// Shared auth header — spread into each request's headers below.
const headers = {
Authorization: `Bearer ${API_KEY}`,
};
No external dependencies needed — just the built-in fetch and fs modules.
Step 1: Create a Schema
/**
 * Create an extraction schema on the API.
 *
 * @param {string} name - Human-readable schema name.
 * @param {object} schema - JSON Schema describing the fields to extract.
 * @returns {Promise<object>} The created schema record (includes `id`).
 * @throws {Error} On a non-2xx response; the message includes the response
 *   body so the failure reason (validation error, duplicate name, …) is visible.
 */
async function createSchema(name, schema) {
  const resp = await fetch(`${API_BASE}/schemas`, {
    method: "POST",
    headers: { ...headers, "Content-Type": "application/json" },
    body: JSON.stringify({ name, jsonSchema: schema }),
  });
  if (!resp.ok) {
    // Fix: a bare status code is hard to act on — surface the server's
    // error body as well (best-effort; ignore body read failures).
    const detail = await resp.text().catch(() => "");
    throw new Error(`Schema creation failed: ${resp.status} ${detail}`.trim());
  }
  return resp.json();
}
// Example: invoice schema
const schema = await createSchema("invoice", {
type: "object",
properties: {
vendor_name: { type: "string" },
invoice_number: { type: "string" },
date: { type: "string", format: "date" },
line_items: {
type: "array",
items: {
type: "object",
properties: {
description: { type: "string" },
quantity: { type: "number" },
unit_price: { type: "number" },
total: { type: "number" },
},
},
},
subtotal: { type: "number" },
tax_amount: { type: "number" },
total: { type: "number" },
},
});
const schemaId = schema.id;
Step 2: Upload a Document
import fs from "node:fs";
/**
 * Upload a document and start an extraction pipeline for it.
 *
 * @param {string} filePath - Path to the document on disk.
 * @param {string} schemaId - ID of the schema to extract against.
 * @returns {Promise<object>} The created pipeline record (includes `id`).
 * @throws {Error} If the upload responds with a non-2xx status.
 */
async function uploadDocument(filePath, schemaId) {
  const formData = new FormData();
  // Fix: pass a filename as the third argument — without it the multipart
  // part is named "blob" and the server loses the original name/extension.
  const fileName = filePath.split(/[\\/]/).pop();
  formData.append("file", new Blob([fs.readFileSync(filePath)]), fileName);
  formData.append("schemaId", schemaId);
  const resp = await fetch(`${API_BASE}/pipeline/file`, {
    method: "POST",
    headers,
    body: formData,
  });
  if (!resp.ok) throw new Error(`Upload failed: ${resp.status}`);
  return resp.json();
}
Step 3: Poll for Results
/** Resolve after `ms` milliseconds — lets polling loops pause with `await`. */
function sleep(ms) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}
/**
 * Poll the pipeline endpoint until the run completes, fails, or times out.
 *
 * @param {string} pipelineId - ID returned by the upload call.
 * @param {number} [timeoutMs=120_000] - Maximum time to keep polling.
 * @returns {Promise<object>} The full pipeline record once completed.
 * @throws {Error} On a non-2xx poll response, a failed pipeline, or timeout.
 */
async function waitForResult(pipelineId, timeoutMs = 120_000) {
  const deadline = Date.now() + timeoutMs;
  while (Date.now() < deadline) {
    const resp = await fetch(`${API_BASE}/pipeline/${pipelineId}`, { headers });
    if (!resp.ok) throw new Error(`Poll failed: ${resp.status}`);
    const data = await resp.json();
    switch (data.status) {
      case "completed":
        return data;
      case "failed":
        throw new Error(`Pipeline failed: ${data.error}`);
      default:
        // Still processing — back off briefly before the next poll.
        await sleep(2000);
    }
  }
  throw new Error(`Timed out after ${timeoutMs}ms`);
}
Complete Extraction Function
Putting it all together:
import fs from "node:fs";
/**
 * Upload a document and wait for its extraction result.
 *
 * @param {string} filePath - Path to the document on disk.
 * @param {string} schemaId - ID of the schema to extract against.
 * @param {number} [timeoutMs=120_000] - Maximum time to wait for completion
 *   (generalized from the previously hard-coded 120s; default is unchanged).
 * @returns {Promise<object>} The extracted data matching the schema.
 * @throws {Error} On upload failure, poll failure, pipeline failure, or timeout.
 */
async function extract(filePath, schemaId, timeoutMs = 120_000) {
  // Upload — include the file name so the server keeps the extension.
  const formData = new FormData();
  const fileName = filePath.split(/[\\/]/).pop();
  formData.append("file", new Blob([fs.readFileSync(filePath)]), fileName);
  formData.append("schemaId", schemaId);
  const uploadResp = await fetch(`${API_BASE}/pipeline/file`, {
    method: "POST",
    headers,
    body: formData,
  });
  if (!uploadResp.ok) throw new Error(`Upload failed: ${uploadResp.status}`);
  const { id: pipelineId } = await uploadResp.json();

  // Poll until the pipeline settles or the timeout elapses.
  const deadline = Date.now() + timeoutMs;
  while (Date.now() < deadline) {
    const resp = await fetch(`${API_BASE}/pipeline/${pipelineId}`, { headers });
    // Fix: the original ignored HTTP errors here, so a 5xx poll response
    // surfaced as a confusing JSON parse failure or a silent retry loop.
    if (!resp.ok) throw new Error(`Poll failed: ${resp.status}`);
    const data = await resp.json();
    if (data.status === "completed") return data.extraction.data;
    if (data.status === "failed") throw new Error(`Failed: ${data.error}`);
    await sleep(2000);
  }
  throw new Error(`Timed out after ${timeoutMs}ms`);
}
// Usage
// The resolved value is the extracted object shaped by the schema above.
const result = await extract("invoice.pdf", schemaId);
console.log(result.vendor_name); // "Acme Corp"
console.log(result.total); // 1250.00
console.log(result.line_items); // [{ description: "...", ... }]
Batch Processing
Process multiple files concurrently with controlled parallelism:
import fs from "node:fs";
import path from "node:path";
/**
 * Extract every PDF in a directory with bounded concurrency.
 *
 * @param {string} directory - Folder to scan for .pdf files.
 * @param {string} schemaId - Schema to extract against.
 * @param {number} [concurrency=5] - Maximum simultaneous extractions.
 * @returns {Promise<Array<{file: string, data: object}>>} Successful results
 *   only; failed files are logged to stderr and skipped.
 */
async function processBatch(directory, schemaId, concurrency = 5) {
  const files = fs.readdirSync(directory).filter((f) => f.endsWith(".pdf"));
  const results = [];
  // Process in chunks so at most `concurrency` extractions run at once.
  for (let i = 0; i < files.length; i += concurrency) {
    const chunk = files.slice(i, i + concurrency);
    const chunkResults = await Promise.allSettled(
      chunk.map(async (file) => {
        const filePath = path.join(directory, file);
        try {
          const data = await extract(filePath, schemaId);
          return { file, data };
        } catch (err) {
          // Fix: tag the rejection with the file name — Promise.allSettled
          // reasons otherwise give no hint which document failed.
          throw new Error(`${file}: ${err.message}`, { cause: err });
        }
      })
    );
    for (const result of chunkResults) {
      if (result.status === "fulfilled") {
        results.push(result.value);
        console.log(` OK: ${result.value.file}`);
      } else {
        console.error(` FAIL: ${result.reason.message}`);
      }
    }
  }
  return results;
}
// Run the whole folder, then persist the successful extractions to disk.
const results = await processBatch("./invoices", schemaId);
fs.writeFileSync("results.json", JSON.stringify(results, null, 2));
Express.js Integration
Add document extraction to an Express API:
import express from "express";
import multer from "multer";
const app = express();
// Keep uploads in memory: the buffer is forwarded straight to the API.
const upload = multer({ storage: multer.memoryStorage() });
// POST /extract — accepts a multipart upload in the "document" field plus a
// schemaId, runs the pipeline, and responds with the extracted JSON.
app.post("/extract", upload.single("document"), async (req, res) => {
  try {
    // Fix: validate the request up front — previously a missing file crashed
    // on `req.file.buffer` and surfaced as an opaque 500 instead of a 400.
    if (!req.file) {
      return res.status(400).json({ error: "No document uploaded" });
    }
    if (!req.body.schemaId) {
      return res.status(400).json({ error: "schemaId is required" });
    }
    const formData = new FormData();
    formData.append("file", new Blob([req.file.buffer]), req.file.originalname);
    formData.append("schemaId", req.body.schemaId);
    const uploadResp = await fetch(`${API_BASE}/pipeline/file`, {
      method: "POST",
      headers,
      body: formData,
    });
    // Fix: the original never checked the upload response, so a failed
    // upload destructured `id` as undefined and polled a bogus pipeline.
    if (!uploadResp.ok) throw new Error(`Upload failed: ${uploadResp.status}`);
    const { id: pipelineId } = await uploadResp.json();
    const result = await waitForResult(pipelineId);
    res.json(result.extraction.data);
  } catch (err) {
    res.status(500).json({ error: err.message });
  }
});
Error Handling with Retries
async function extractWithRetries(filePath, schemaId, maxRetries = 3) {
let lastError;
for (let attempt = 1; attempt <= maxRetries; attempt++) {
try {
return await extract(filePath, schemaId);
} catch (err) {
lastError = err;
if (attempt < maxRetries) {
const delay = 2 ** attempt * 1000;
console.log(` Retry ${attempt}/${maxRetries} in ${delay}ms...`);
await sleep(delay);
}
}
}
throw lastError;
}
TypeScript Types
Add type safety to your extraction results:
/** Shape of the data extracted with the "invoice" schema defined earlier. */
interface InvoiceData {
vendor_name: string;
invoice_number: string;
/** Date string — the schema requests JSON Schema format "date". */
date: string;
line_items: {
description: string;
quantity: number;
unit_price: number;
total: number;
}[];
subtotal: number;
tax_amount: number;
total: number;
}
// The cast is unchecked at runtime — the API result is trusted to match the schema.
const result = await extract("invoice.pdf", schemaId) as InvoiceData;
// Full type safety from here
Try It Now
Test extraction in the Playground before writing code — upload a document and see the JSON output. Then use the code above to integrate into your JavaScript project.
For the full API reference, see the documentation.
Related articles
How to Convert PDFs to JSON with an API
A practical guide to converting PDF documents into structured JSON data using a REST API. Covers digital PDFs, scanned documents, and batch processing.
Python: Extract Structured Data from Documents with Python
How to extract structured JSON data from PDFs, scanned documents, and Word files using Python. Complete code examples with requests, error handling, and batch processing.
DOCX: Convert Word Documents (DOCX) to JSON via API
How to extract structured JSON data from Word documents using a REST API. Convert DOCX files to structured data for contracts, reports, and forms.
