## Basic Concurrent Scraping
```typescript
import { ReaderClient } from "@vakra-dev/reader";

const reader = new ReaderClient();

const result = await reader.scrape({
  urls: [
    "https://example.com/page1",
    "https://example.com/page2",
    "https://example.com/page3",
    "https://example.com/page4",
    "https://example.com/page5",
  ],
  batchConcurrency: 3, // Process 3 URLs at a time
});

console.log(`Scraped ${result.batchMetadata.successfulUrls} URLs`);

await reader.close();
```
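Each batch call returns the scraped pages in `result.data` alongside `result.batchMetadata`. Going by the examples further down this page, each entry exposes at least `markdown` and `metadata.baseUrl`; any other fields are not documented here:

```typescript
// Fields inferred from the examples on this page; others may exist.
result.data.forEach((page) => {
  console.log(page.metadata.baseUrl, `${page.markdown.length} chars of markdown`);
});
```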
## With Progress Tracking
```typescript
const urls = Array.from({ length: 20 }, (_, i) =>
  `https://example.com/page${i + 1}`
);

const result = await reader.scrape({
  urls,
  batchConcurrency: 5,
  onProgress: (progress) => {
    const percent = Math.round((progress.completed / progress.total) * 100);
    console.log(`[${percent}%] ${progress.completed}/${progress.total}: ${progress.currentUrl}`);
  },
});
```
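For reference, the fields used in the callbacks on this page imply a progress payload of roughly this shape (a sketch inferred from the examples, not an official type; additional fields may exist):

```typescript
// Sketch of the progress payload, inferred from the fields used on this page.
interface BatchProgress {
  completed: number;  // URLs finished so far
  total: number;      // total URLs in the batch
  currentUrl: string; // the URL that just finished
}
```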
## Configure Browser Pool for High Volume
For high-volume scraping, configure a larger browser pool:
```typescript
const reader = new ReaderClient({
  browserPool: {
    size: 10,              // 10 browser instances
    retireAfterPages: 25,  // Recycle frequently to prevent memory issues
    retireAfterMinutes: 10,
    maxQueueSize: 200,     // Larger queue for bursts
  },
});

const result = await reader.scrape({
  urls: manyUrls,
  batchConcurrency: 10, // Match pool size
});
```
## Batch Timeout
Set a timeout for the entire batch operation:
```typescript
const result = await reader.scrape({
  urls: manyUrls,
  batchConcurrency: 5,
  timeoutMs: 30000,       // 30s per page
  batchTimeoutMs: 600000, // 10 minutes for entire batch
});
```
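This page doesn't show how a batch timeout is surfaced. Assuming the returned promise rejects when `batchTimeoutMs` elapses, a defensive sketch looks like this (the error shape is an assumption):

```typescript
// Sketch: assumes reader.scrape rejects if the batch exceeds batchTimeoutMs.
try {
  const result = await reader.scrape({
    urls: manyUrls,
    batchConcurrency: 5,
    timeoutMs: 30000,
    batchTimeoutMs: 600000,
  });
  console.log(`Finished: ${result.batchMetadata.successfulUrls} succeeded`);
} catch (err) {
  // The exact error type is not documented here.
  console.error("Batch did not complete in time:", err);
}
```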
## Retry Failed URLs
Reader automatically retries failed URLs:
```typescript
const result = await reader.scrape({
  urls: manyUrls,
  maxRetries: 3, // Retry up to 3 times (default: 2)
});

// Check for any remaining failures
if (result.batchMetadata.errors) {
  console.log("Failed URLs after retries:");
  result.batchMetadata.errors.forEach((e) => {
    console.log(`  ${e.url}: ${e.error}`);
  });
}
```
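If some URLs still fail after the automatic retries, nothing stops you from feeding them back into a second `scrape` call. A sketch using only the batch API shown above:

```typescript
// Sketch: re-scrape only the URLs that failed in the first pass.
const failedUrls = (result.batchMetadata.errors ?? []).map((e) => e.url);

if (failedUrls.length > 0) {
  const secondPass = await reader.scrape({
    urls: failedUrls,
    batchConcurrency: 2, // slow down for the second attempt
    maxRetries: 3,
  });
  console.log(`Recovered ${secondPass.batchMetadata.successfulUrls} of ${failedUrls.length}`);
}
```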
## Handle Partial Failures
```typescript
const result = await reader.scrape({
  urls: manyUrls,
  batchConcurrency: 5,
});

const { successfulUrls, failedUrls, errors } = result.batchMetadata;
console.log(`Success: ${successfulUrls}, Failed: ${failedUrls}`);

// Process successful results
result.data.forEach((page) => {
  // Save to database, process with LLM, etc.
  // (saveToDatabase is a placeholder for your own persistence logic)
  saveToDatabase(page.metadata.baseUrl, page.markdown);
});

// Handle failures
if (errors && errors.length > 0) {
  // Log for manual review or retry later (logFailure is a placeholder)
  errors.forEach((e) => {
    logFailure(e.url, e.error);
  });
}
```
## Streaming Results
For very large batches, use `onProgress` to track completion as it happens; the scraped pages themselves are returned together once the batch finishes:
```typescript
const urls = getLargeUrlList(); // Thousands of URLs

let processed = 0;

const result = await reader.scrape({
  urls,
  batchConcurrency: 10,
  onProgress: (progress) => {
    // Fires as each URL completes; page content itself is only
    // available in result.data once the whole batch finishes.
    processed++;
    console.log(`Processed ${processed}/${urls.length}`);
  },
});

// All results are available here
result.data.forEach((page) => {
  processPage(page);
});
```
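If you need page content incrementally rather than at the end, one workaround is to split the URL list into chunks and run each chunk as its own batch. A sketch built only on the batch API shown above; the chunk size is arbitrary:

```typescript
// Sketch: approximate streaming by scraping fixed-size chunks sequentially.
const CHUNK_SIZE = 100; // arbitrary; tune for your workload

for (let i = 0; i < urls.length; i += CHUNK_SIZE) {
  const chunk = urls.slice(i, i + CHUNK_SIZE);
  const chunkResult = await reader.scrape({ urls: chunk, batchConcurrency: 10 });

  // This chunk's pages are available now, before later chunks start.
  chunkResult.data.forEach((page) => processPage(page));
}
```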
## With Proxy Rotation
Distribute requests across multiple proxies:
```typescript
const reader = new ReaderClient({
  proxies: [
    { host: "proxy1.example.com", port: 8080, username: "user", password: "pass" },
    { host: "proxy2.example.com", port: 8080, username: "user", password: "pass" },
    { host: "proxy3.example.com", port: 8080, username: "user", password: "pass" },
  ],
  proxyRotation: "round-robin",
  browserPool: { size: 10 },
});

const result = await reader.scrape({
  urls: manyUrls,
  batchConcurrency: 10,
});
```
## Complete Example
```typescript
import { ReaderClient } from "@vakra-dev/reader";

async function batchScrape(urls: string[]) {
  const reader = new ReaderClient({
    verbose: true,
    browserPool: {
      size: 5,
      retireAfterPages: 50,
    },
  });

  try {
    console.log(`Starting batch scrape of ${urls.length} URLs...`);
    const startTime = Date.now();

    const result = await reader.scrape({
      urls,
      formats: ["markdown"],
      batchConcurrency: 5,
      timeoutMs: 30000,
      batchTimeoutMs: 300000,
      maxRetries: 2,
      onProgress: (p) => {
        const elapsed = Math.round((Date.now() - startTime) / 1000);
        const rate = elapsed > 0 ? p.completed / elapsed : 0; // guard against division by zero
        const eta = rate > 0 ? Math.round((p.total - p.completed) / rate) : 0;
        console.log(
          `[${p.completed}/${p.total}] ${p.currentUrl} (${rate.toFixed(1)}/s, ETA: ${eta}s)`
        );
      },
    });

    const duration = Date.now() - startTime;
    console.log(`\n--- Batch Complete ---`);
    console.log(`Total: ${urls.length}`);
    console.log(`Success: ${result.batchMetadata.successfulUrls}`);
    console.log(`Failed: ${result.batchMetadata.failedUrls}`);
    console.log(`Duration: ${(duration / 1000).toFixed(1)}s`);
    console.log(`Rate: ${(urls.length / (duration / 1000)).toFixed(1)} URLs/s`);

    return result;
  } finally {
    await reader.close();
  }
}

// Usage
const urls = [
  "https://news.ycombinator.com",
  "https://lobste.rs",
  "https://reddit.com/r/programming",
  // ... more URLs
];

batchScrape(urls).catch(console.error);
```

