EU-Utility/skills/playwright/scraping.md

3.7 KiB

Scraping Patterns

Basic Extraction

// Launch a browser, scrape every product card on the listing page, and always
// shut the browser down afterwards.
const browser = await chromium.launch();
let products: Array<{ name?: string; price?: string; url?: string }> = [];

try {
  const page = await browser.newPage();
  await page.goto('https://example.com/products');

  // evaluateAll runs the callback inside the page with every matching element,
  // so only plain serialisable objects cross back into Node.
  products = await page.locator('.product-card').evaluateAll(cards =>
    cards.map(card => ({
      // Optional chaining: a card missing one of these nodes yields undefined
      // for that field instead of throwing inside the page.
      name: card.querySelector('.name')?.textContent?.trim(),
      price: card.querySelector('.price')?.textContent?.trim(),
      url: card.querySelector('a')?.href,
    }))
  );
} finally {
  // Close in `finally` so a failed navigation or extraction does not leak the
  // browser process.
  await browser.close();
}

Wait Strategies for SPAs

// Block until the app flags itself as ready through a data attribute
await page.waitForSelector('[data-loaded="true"]');

// Navigate and wait until the network goes quiet
// (careful with SPAs: background requests can keep the network busy)
await page.goto(url, { waitUntil: 'networkidle' });

// Block until the loading spinner is hidden or gone from the DOM
await page.waitForSelector('.loading-spinner', { state: 'hidden' });

// Poll a custom condition: retry until at least one product is rendered
await expect
  .poll(async () => page.locator('.product').count())
  .toBeGreaterThan(0);

Infinite Scroll

/**
 * Scrolls an infinite-scroll page until its height stops growing.
 *
 * Each round reads `document.body.scrollHeight`, stops if it equals the height
 * seen on the previous round, and otherwise scrolls to the bottom and pauses so
 * lazily loaded content can arrive.
 *
 * @param page - Page to scroll.
 * @param options.delayMs - Pause after each scroll, in milliseconds (default 1000).
 * @param options.maxScrolls - Upper bound on scroll rounds (default unbounded).
 *   Set this for feeds that never stop loading; without it the loop only ends
 *   when the height stabilises.
 */
async function scrollToBottom(
  page: Page,
  options: { delayMs?: number; maxScrolls?: number } = {},
): Promise<void> {
  const { delayMs = 1000, maxScrolls = Infinity } = options;
  let previousHeight = 0;

  for (let scrolls = 0; scrolls < maxScrolls; scrolls++) {
    const currentHeight = await page.evaluate(() => document.body.scrollHeight);
    // Height unchanged since the last round: nothing new was loaded.
    if (currentHeight === previousHeight) break;

    previousHeight = currentHeight;
    await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
    await page.waitForTimeout(delayMs);  // Allow content to load
  }
}

Pagination

// Click-based pagination
/**
 * Collects data from every page of a listing that is paged with a "Next" button.
 *
 * Extracts the current page with `extractData`, then clicks "Next" and waits
 * for the network to settle, repeating until the button is disabled or absent.
 *
 * @param page - Page already showing the first page of results.
 * @returns The items from all visited pages, in visit order.
 */
async function scrapeAllPages(page: Page) {
  const allData = [];
  // Locators are resolved lazily on each action, so one instance serves every page.
  const nextButton = page.getByRole('button', { name: 'Next' });

  while (true) {
    const pageData = await extractData(page);
    allData.push(...pageData);

    // Some sites remove the button on the last page instead of disabling it.
    // isDisabled() would wait for the element and throw on timeout, so check
    // for its presence first.
    const hasNext = (await nextButton.count()) > 0;
    if (!hasNext || (await nextButton.isDisabled())) break;

    await nextButton.click();
    await page.waitForLoadState('networkidle');
  }

  return allData;
}

Anti-Bot Evasion

const browser = await chromium.launch({
  headless: false,  // Some sites detect headless
});

// Give the context a consistent desktop fingerprint: user agent, viewport,
// locale and timezone.
const context = await browser.newContext({
  userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
  viewport: { width: 1920, height: 1080 },
  locale: 'en-US',
  timezoneId: 'America/New_York',
});

// Open the page from the configured context so the settings above apply to it.
const page = await context.newPage();

// Add realistic behavior: move the pointer, then pause for a random 1-3 seconds
await page.mouse.move(100, 100);
await page.waitForTimeout(Math.random() * 2000 + 1000);

Session Management

// Save cookies: write the current context's storage state to session.json
await context.storageState({ path: 'session.json' });

// Restore session: create a new context seeded from the saved file.
// NOTE: this declares a new `context`, so it has to live in a different scope
// (or a later run) than the save call above, which uses an existing `context`.
const context = await browser.newContext({
  storageState: 'session.json',
});

Error Handling

/**
 * Loads `url` in a fresh page and extracts its data, retrying on failure.
 *
 * Every attempt opens its own page and closes it again whether the attempt
 * succeeded or failed. Between attempts the delay grows linearly
 * (2s, 4s, 6s, ...).
 *
 * @param url - Address to scrape.
 * @param retries - Maximum number of attempts (default 3).
 * @returns Whatever `extractData` returns for the loaded page, or `undefined`
 *   when `retries` is zero or negative and no attempt is made.
 * @throws The error from the final attempt once all attempts have failed.
 */
async function scrapeWithRetry(url: string, retries = 3) {
  for (let attempt = 0; attempt < retries; attempt++) {
    try {
      const page = await context.newPage();
      // Inner try/finally keeps `page` in scope for cleanup. A `finally` on the
      // outer try cannot see a `const` declared inside the try block.
      try {
        await page.goto(url, { timeout: 30000 });
        return await extractData(page);
      } finally {
        await page.close();
      }
    } catch (error) {
      // Out of attempts: surface the last failure to the caller.
      if (attempt === retries - 1) throw error;
      // Linear backoff before the next attempt.
      await new Promise(r => setTimeout(r, 2000 * (attempt + 1)));
    }
  }
  return undefined;
}

Rate Limiting

/**
 * Spaces out sequential calls so that at least `minDelay` milliseconds pass
 * between the end of one `wait()` and the end of the next.
 */
class RateLimiter {
  // Timestamp (ms since epoch) at which the previous wait() finished; 0 means
  // no call has happened yet, so the first call never sleeps.
  private lastRequest = 0;

  constructor(private minDelay: number) {}

  /** Sleeps for whatever is left of the minimum gap, then records the time. */
  async wait() {
    const remaining = this.minDelay - (Date.now() - this.lastRequest);
    if (remaining > 0) {
      await new Promise(resolve => setTimeout(resolve, remaining));
    }
    this.lastRequest = Date.now();
  }
}

// One shared limiter for the whole run, so the gap applies across all URLs.
const limiter = new RateLimiter(2000);  // 2s between requests

// Process URLs strictly one at a time: wait out the remaining gap, then scrape.
for (const url of urls) {
  await limiter.wait();
  await scrape(url);
}

Proxy Rotation

// Pool of proxy servers, handed out in round-robin order.
const proxies = ['proxy1:8080', 'proxy2:8080', 'proxy3:8080'];
// Position of the proxy that the next call will return.
let proxyIndex = 0;

/** Returns the proxy at the current position and advances, wrapping at the end. */
async function getNextProxy() {
  const current = proxyIndex;
  proxyIndex = (current + 1) % proxies.length;
  return proxies[current];
}

// Take the next proxy from the rotation and launch a browser that uses it.
const proxyServer = await getNextProxy();
const browser = await chromium.launch({ proxy: { server: proxyServer } });