Back to blog
Tags: javascript · nodejs · tutorial · api

Extract Document Data with JavaScript and Node.js

February 16, 2026 — Smole Team

Extract Document Data with JavaScript and Node.js

Whether you're building a backend service with Node.js or a serverless function on Vercel, extracting structured data from documents is a common need. This guide shows how to integrate document extraction into your JavaScript project — from a single file to batch processing pipelines.

Setup

Works with Node.js 18+ (native fetch) or any environment with fetch available.

const API_BASE = "https://api.smole.tech/api";
// Prefer the environment variable in real deployments — never commit secrets
// to source control. Get a key at https://smole.tech/account/api-keys
const API_KEY = process.env.SMOLE_API_KEY ?? "your_api_key_here";

// Shared auth header, spread into every request.
const headers = {
  Authorization: `Bearer ${API_KEY}`,
};

No external dependencies needed — just the built-in fetch and fs modules.

Step 1: Create a Schema

/**
 * Create an extraction schema on the API.
 * @param {string} name - Human-readable schema name.
 * @param {object} schema - JSON Schema describing the fields to extract.
 * @returns {Promise<object>} The created schema record (includes `id`).
 * @throws {Error} If the API responds with a non-2xx status.
 */
async function createSchema(name, schema) {
  const resp = await fetch(`${API_BASE}/schemas`, {
    method: "POST",
    headers: { ...headers, "Content-Type": "application/json" },
    body: JSON.stringify({ name, jsonSchema: schema }),
  });

  if (!resp.ok) {
    // Surface the server's error body — a bare status code hides validation details.
    const detail = await resp.text().catch(() => "");
    throw new Error(`Schema creation failed: ${resp.status} ${detail}`.trim());
  }
  return resp.json();
}

// Example: invoice schema
// Standard JSON Schema — nested objects and arrays are supported, so line
// items can be modeled as an array of objects.
const schema = await createSchema("invoice", {
  type: "object",
  properties: {
    vendor_name: { type: "string" },
    invoice_number: { type: "string" },
    date: { type: "string", format: "date" },
    line_items: {
      type: "array",
      items: {
        type: "object",
        properties: {
          description: { type: "string" },
          quantity: { type: "number" },
          unit_price: { type: "number" },
          total: { type: "number" },
        },
      },
    },
    subtotal: { type: "number" },
    tax_amount: { type: "number" },
    total: { type: "number" },
  },
});

// Keep the id around — every upload must reference a schema.
const schemaId = schema.id;

Step 2: Upload a Document

import fs from "node:fs";
import path from "node:path";

/**
 * Upload a document and start an extraction pipeline.
 * @param {string} filePath - Path to the document on disk.
 * @param {string} schemaId - ID of the schema to extract against.
 * @returns {Promise<object>} The created pipeline record (includes `id`).
 * @throws {Error} If the upload is rejected.
 */
async function uploadDocument(filePath, schemaId) {
  const formData = new FormData();
  // Pass the filename so the server can detect the file type from its extension.
  formData.append("file", new Blob([fs.readFileSync(filePath)]), path.basename(filePath));
  formData.append("schemaId", schemaId);

  const resp = await fetch(`${API_BASE}/pipeline/file`, {
    method: "POST",
    headers,
    body: formData,
  });

  if (!resp.ok) throw new Error(`Upload failed: ${resp.status}`);
  return resp.json();
}

Step 3: Poll for Results

/** Resolve after `ms` milliseconds — used to pause between poll attempts. */
function sleep(ms) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}

/**
 * Poll a pipeline every 2 seconds until it completes, fails, or the
 * timeout elapses.
 * @param {string} pipelineId - Pipeline to watch.
 * @param {number} [timeoutMs=120000] - Maximum time to wait.
 * @returns {Promise<object>} The completed pipeline record.
 * @throws {Error} On a failed poll request, a failed pipeline, or timeout.
 */
async function waitForResult(pipelineId, timeoutMs = 120_000) {
  const deadline = Date.now() + timeoutMs;

  while (Date.now() < deadline) {
    const resp = await fetch(`${API_BASE}/pipeline/${pipelineId}`, { headers });
    if (!resp.ok) throw new Error(`Poll failed: ${resp.status}`);

    const data = await resp.json();

    switch (data.status) {
      case "completed":
        return data;
      case "failed":
        throw new Error(`Pipeline failed: ${data.error}`);
      default:
        await sleep(2000);
    }
  }

  throw new Error(`Timed out after ${timeoutMs}ms`);
}

Complete Extraction Function

Putting it all together:

import fs from "node:fs";
import path from "node:path";

/**
 * Upload a document, wait for the pipeline to finish, and return the
 * extracted structured data.
 * @param {string} filePath - Path to the document on disk.
 * @param {string} schemaId - ID of the schema to extract against.
 * @param {number} [timeoutMs=120000] - How long to poll before giving up.
 * @returns {Promise<object>} The extraction result matching the schema.
 * @throws {Error} On upload failure, poll failure, pipeline failure, or timeout.
 */
async function extract(filePath, schemaId, timeoutMs = 120_000) {
  // Upload — include the filename so the server can infer the file type.
  const formData = new FormData();
  formData.append("file", new Blob([fs.readFileSync(filePath)]), path.basename(filePath));
  formData.append("schemaId", schemaId);

  const uploadResp = await fetch(`${API_BASE}/pipeline/file`, {
    method: "POST",
    headers,
    body: formData,
  });

  if (!uploadResp.ok) throw new Error(`Upload failed: ${uploadResp.status}`);
  const { id: pipelineId } = await uploadResp.json();

  // Poll until the pipeline settles or the deadline passes.
  const start = Date.now();
  while (Date.now() - start < timeoutMs) {
    const resp = await fetch(`${API_BASE}/pipeline/${pipelineId}`, { headers });
    // Don't parse an error page as pipeline state — fail loudly instead.
    if (!resp.ok) throw new Error(`Poll failed: ${resp.status}`);

    const data = await resp.json();

    if (data.status === "completed") return data.extraction.data;
    if (data.status === "failed") throw new Error(`Failed: ${data.error}`);

    await sleep(2000);
  }

  throw new Error(`Timed out after ${timeoutMs}ms`);
}

// Usage — the resolved object matches the schema's `properties` keys.
const result = await extract("invoice.pdf", schemaId);
console.log(result.vendor_name);  // "Acme Corp"
console.log(result.total);         // 1250.00
console.log(result.line_items);    // [{ description: "...", ... }]

Batch Processing

Process multiple files concurrently with controlled parallelism:

import fs from "node:fs";
import path from "node:path";

/**
 * Extract every PDF in `directory`, at most `concurrency` files at a time.
 * Failures are logged and skipped; only successes are returned.
 * @param {string} directory - Folder to scan for `.pdf` files.
 * @param {string} schemaId - Schema to extract against.
 * @param {number} [concurrency=5] - Max in-flight extractions per chunk.
 * @returns {Promise<Array<{file: string, data: object}>>} Successful results.
 */
async function processBatch(directory, schemaId, concurrency = 5) {
  const files = fs.readdirSync(directory).filter((f) => f.endsWith(".pdf"));
  const results = [];

  // Process in chunks of `concurrency` so we never overload the API.
  for (let i = 0; i < files.length; i += concurrency) {
    const chunk = files.slice(i, i + concurrency);
    const chunkResults = await Promise.allSettled(
      chunk.map(async (file) => {
        try {
          const data = await extract(path.join(directory, file), schemaId);
          return { file, data };
        } catch (err) {
          // Tag the failure with its filename — Promise.allSettled otherwise
          // loses track of which input produced which rejection.
          throw new Error(`${file}: ${err.message}`, { cause: err });
        }
      })
    );

    for (const result of chunkResults) {
      if (result.status === "fulfilled") {
        results.push(result.value);
        console.log(`  OK: ${result.value.file}`);
      } else {
        console.error(`  FAIL: ${result.reason.message}`);
      }
    }
  }

  return results;
}

// Extract every PDF in ./invoices and persist the combined results.
const results = await processBatch("./invoices", schemaId);
fs.writeFileSync("results.json", JSON.stringify(results, null, 2));

Express.js Integration

Add document extraction to an Express API:

import express from "express";
import multer from "multer";

const app = express();
// Keep uploads in memory — we forward the buffer straight to the API.
const upload = multer({ storage: multer.memoryStorage() });

/**
 * POST /extract — accept a multipart upload (a "document" file field plus a
 * "schemaId" field), run it through the extraction pipeline, and respond
 * with the structured data as JSON.
 */
app.post("/extract", upload.single("document"), async (req, res) => {
  try {
    // Reject malformed requests up front with a 400 rather than a 500 later.
    if (!req.file) {
      return res.status(400).json({ error: "missing 'document' file field" });
    }
    if (!req.body.schemaId) {
      return res.status(400).json({ error: "missing 'schemaId' field" });
    }

    const formData = new FormData();
    formData.append("file", new Blob([req.file.buffer]), req.file.originalname);
    formData.append("schemaId", req.body.schemaId);

    const uploadResp = await fetch(`${API_BASE}/pipeline/file`, {
      method: "POST",
      headers,
      body: formData,
    });

    // Fail fast with the upstream status instead of crashing later on a
    // missing pipeline id.
    if (!uploadResp.ok) throw new Error(`Upload failed: ${uploadResp.status}`);

    const { id: pipelineId } = await uploadResp.json();
    const result = await waitForResult(pipelineId);

    res.json(result.extraction.data);
  } catch (err) {
    res.status(500).json({ error: err.message });
  }
});

Error Handling with Retries

/**
 * Run `extract` with up to `maxRetries` attempts and exponential backoff
 * (2s, 4s, 8s, ...). Rethrows the last error if every attempt fails.
 * @param {string} filePath - Path to the document on disk.
 * @param {string} schemaId - Schema to extract against.
 * @param {number} [maxRetries=3] - Total number of attempts.
 * @returns {Promise<object>} The extraction result from the first success.
 */
async function extractWithRetries(filePath, schemaId, maxRetries = 3) {
  let lastError;

  for (let attempt = 1; attempt <= maxRetries; attempt++) {
    try {
      return await extract(filePath, schemaId);
    } catch (err) {
      lastError = err;
      const isFinalAttempt = attempt === maxRetries;
      if (!isFinalAttempt) {
        const delay = 1000 * 2 ** attempt;
        console.log(`  Retry ${attempt}/${maxRetries} in ${delay}ms...`);
        await sleep(delay);
      }
    }
  }

  throw lastError;
}

TypeScript Types

Add type safety to your extraction results:

// Mirror of the "invoice" JSON Schema created above — keep the two in sync
// so the cast below stays honest.
interface InvoiceData {
  vendor_name: string;
  invoice_number: string;
  date: string; // ISO date string ("format: date" in the schema)
  line_items: {
    description: string;
    quantity: number;
    unit_price: number;
    total: number;
  }[];
  subtotal: number;
  tax_amount: number;
  total: number;
}

// NOTE: `as` is an unchecked assertion — the API response is not validated
// against this interface at runtime.
const result = await extract("invoice.pdf", schemaId) as InvoiceData;
// Full type safety from here

Try It Now

Test extraction in the Playground before writing code — upload a document and see the JSON output. Then use the code above to integrate into your JavaScript project.

For the full API reference, see the documentation.