This guide covers how to scrape many URLs efficiently with concurrency and progress tracking.

Basic Concurrent Scraping

import { ReaderClient } from "@vakra-dev/reader";

const reader = new ReaderClient();

const result = await reader.scrape({
  urls: [
    "https://example.com/page1",
    "https://example.com/page2",
    "https://example.com/page3",
    "https://example.com/page4",
    "https://example.com/page5",
  ],
  batchConcurrency: 3, // Process 3 URLs at a time
});

console.log(`Scraped ${result.batchMetadata.successfulUrls} URLs`);

await reader.close();
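
Each successfully scraped page is returned in result.data. As a quick sketch (the page.markdown and page.metadata.baseUrl fields are the same ones used in the failure-handling example later in this guide):

for (const page of result.data) {
  // Log the source URL and the length of the extracted Markdown
  console.log(`${page.metadata.baseUrl}: ${page.markdown.length} characters`);
}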

With Progress Tracking

const urls = Array.from({ length: 20 }, (_, i) =>
  `https://example.com/page${i + 1}`
);

const result = await reader.scrape({
  urls,
  batchConcurrency: 5,
  onProgress: (progress) => {
    const percent = Math.round((progress.completed / progress.total) * 100);
    console.log(`[${percent}%] ${progress.completed}/${progress.total}: ${progress.currentUrl}`);
  },
});

Configure Browser Pool for High Volume

For high-volume scraping, configure a larger browser pool:
const reader = new ReaderClient({
  browserPool: {
    size: 10,             // 10 browser instances
    retireAfterPages: 25, // Recycle frequently to prevent memory issues
    retireAfterMinutes: 10,
    maxQueueSize: 200,    // Larger queue for bursts
  },
});

const result = await reader.scrape({
  urls: manyUrls,
  batchConcurrency: 10, // Match pool size
});

Batch Timeout

Set a timeout for the entire batch operation:
const result = await reader.scrape({
  urls: manyUrls,
  batchConcurrency: 5,
  timeoutMs: 30000,       // 30s per page
  batchTimeoutMs: 600000, // 10 minutes for entire batch
});
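
What happens when the batch timeout fires depends on the client's error handling; the sketch below defensively assumes the scrape promise may reject when batchTimeoutMs is exceeded (check the API reference for the exact behavior):

try {
  const result = await reader.scrape({
    urls: manyUrls,
    batchConcurrency: 5,
    batchTimeoutMs: 600000,
  });
  console.log(`Finished: ${result.batchMetadata.successfulUrls} URLs scraped`);
} catch (err) {
  // Assumption: exceeding batchTimeoutMs rejects the promise rather than
  // returning partial results
  console.error("Batch did not complete:", err);
}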

Retry Failed URLs

Reader automatically retries failed URLs:
const result = await reader.scrape({
  urls: manyUrls,
  maxRetries: 3, // Retry up to 3 times (default: 2)
});

// Check for any remaining failures
if (result.batchMetadata.errors && result.batchMetadata.errors.length > 0) {
  console.log("Failed URLs after retries:");
  result.batchMetadata.errors.forEach((e) => {
    console.log(`  ${e.url}: ${e.error}`);
  });
}
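
If some URLs still fail after the automatic retries, one option is to collect them and re-run a smaller batch. This is just the same scrape call applied to the failed URLs, not a separate library feature:

const failedUrls = (result.batchMetadata.errors ?? []).map((e) => e.url);

if (failedUrls.length > 0) {
  // Retry only the failures, with lower concurrency to reduce load on slow sites
  const retryResult = await reader.scrape({
    urls: failedUrls,
    batchConcurrency: 2,
    maxRetries: 3,
  });
  console.log(`Recovered ${retryResult.batchMetadata.successfulUrls} of ${failedUrls.length} URLs`);
}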

Handle Partial Failures

const result = await reader.scrape({
  urls: manyUrls,
  batchConcurrency: 5,
});

const { successfulUrls, failedUrls, errors } = result.batchMetadata;

console.log(`Success: ${successfulUrls}, Failed: ${failedUrls}`);

// Process successful results
result.data.forEach((page) => {
  // Save to database, process with LLM, etc.
  saveToDatabase(page.metadata.baseUrl, page.markdown);
});

// Handle failures
if (errors && errors.length > 0) {
  // Log for manual review or retry later
  errors.forEach((e) => {
    logFailure(e.url, e.error);
  });
}

Streaming Results

For very large batches, use onProgress to track completion as each URL finishes. Note that the scraped content itself is only returned once the entire batch completes:
const urls = getLargeUrlList(); // Thousands of URLs
let processed = 0;

const result = await reader.scrape({
  urls,
  batchConcurrency: 10,
  onProgress: (progress) => {
    // onProgress fires as each URL completes; the page content itself
    // becomes available in result.data only after the batch finishes
    processed++;
    console.log(`Processed ${processed}/${urls.length}`);
  },
});
});

// All results are available here
result.data.forEach((page) => {
  processPage(page);
});
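
If you need to persist content before the whole batch finishes, a simple workaround is to split the URL list into chunks and call scrape once per chunk, handling each chunk's results as soon as that call returns. A sketch (the chunk size and processPage are placeholders):

const chunkSize = 100;

for (let i = 0; i < urls.length; i += chunkSize) {
  const chunk = urls.slice(i, i + chunkSize);
  const chunkResult = await reader.scrape({
    urls: chunk,
    batchConcurrency: 10,
  });

  // Handle this chunk's pages before moving on to the next chunk
  chunkResult.data.forEach((page) => processPage(page));
  console.log(`Chunk ${i / chunkSize + 1}: ${chunkResult.batchMetadata.successfulUrls} pages scraped`);
}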

With Proxy Rotation

Distribute requests across multiple proxies:
const reader = new ReaderClient({
  proxies: [
    { host: "proxy1.example.com", port: 8080, username: "user", password: "pass" },
    { host: "proxy2.example.com", port: 8080, username: "user", password: "pass" },
    { host: "proxy3.example.com", port: 8080, username: "user", password: "pass" },
  ],
  proxyRotation: "round-robin",
  browserPool: { size: 10 },
});

const result = await reader.scrape({
  urls: manyUrls,
  batchConcurrency: 10,
});

Complete Example

import { ReaderClient } from "@vakra-dev/reader";

async function batchScrape(urls: string[]) {
  const reader = new ReaderClient({
    verbose: true,
    browserPool: {
      size: 5,
      retireAfterPages: 50,
    },
  });

  try {
    console.log(`Starting batch scrape of ${urls.length} URLs...`);
    const startTime = Date.now();

    const result = await reader.scrape({
      urls,
      formats: ["markdown"],
      batchConcurrency: 5,
      timeoutMs: 30000,
      batchTimeoutMs: 300000,
      maxRetries: 2,
      onProgress: (p) => {
        const elapsed = Math.round((Date.now() - startTime) / 1000);
        const rate = elapsed > 0 ? p.completed / elapsed : 0;
        const eta = rate > 0 ? Math.round((p.total - p.completed) / rate) : 0;
        console.log(
          `[${p.completed}/${p.total}] ${p.currentUrl} (${rate.toFixed(1)}/s, ETA: ${eta}s)`
        );
      },
    });

    const duration = Date.now() - startTime;
    console.log(`\n--- Batch Complete ---`);
    console.log(`Total: ${urls.length}`);
    console.log(`Success: ${result.batchMetadata.successfulUrls}`);
    console.log(`Failed: ${result.batchMetadata.failedUrls}`);
    console.log(`Duration: ${(duration / 1000).toFixed(1)}s`);
    console.log(`Rate: ${(urls.length / (duration / 1000)).toFixed(1)} URLs/s`);

    return result;
  } finally {
    await reader.close();
  }
}

// Usage
const urls = [
  "https://news.ycombinator.com",
  "https://lobste.rs",
  "https://reddit.com/r/programming",
  // ... more URLs
];

batchScrape(urls).catch(console.error);

Next Steps