# Scraping Patterns

## Basic Extraction

```typescript
// Launch a browser and open the product listing page.
const browser = await chromium.launch();
const page = await browser.newPage();
await page.goto('https://example.com/products');

// The callback runs inside the page: every `.product-card` element becomes a
// plain object. A field whose element is missing comes back as `undefined`.
const products = await page.$$eval('.product-card', cards => {
  const textOf = (root: Element, selector: string) =>
    root.querySelector(selector)?.textContent?.trim();

  return cards.map(card => ({
    name: textOf(card, '.name'),
    price: textOf(card, '.price'),
    url: card.querySelector('a')?.href,
  }));
});

await browser.close();
```

## Wait Strategies for SPAs

```typescript
// Wait for a specific element to show up
await page.waitForSelector('[data-loaded="true"]');

// Wait for the network to go idle while navigating
// (careful with SPAs: background requests can delay or prevent this)
await page.goto(url, { waitUntil: 'networkidle' });

// Wait until the loading indicator is gone
await page.waitForSelector('.loading-spinner', { state: 'hidden' });

// Poll a custom condition: retry until at least one `.product` exists
await expect.poll(() => page.locator('.product').count()).toBeGreaterThan(0);
```

## Infinite Scroll

```typescript
/**
 * Scrolls the page down repeatedly until `document.body.scrollHeight` stops
 * growing between two consecutive checks, or until `maxScrolls` scroll steps
 * have been performed.
 *
 * @param page - The page to scroll.
 * @param options - Optional tuning knobs.
 *   - `maxScrolls`: upper bound on scroll steps. Defaults to `Infinity`
 *     (no bound); set it for feeds that keep producing content forever.
 *   - `pauseMs`: time to wait after each scroll so new content can load.
 *     Defaults to 1000.
 */
async function scrollToBottom(
  page: Page,
  options: { maxScrolls?: number; pauseMs?: number } = {},
): Promise<void> {
  const { maxScrolls = Infinity, pauseMs = 1000 } = options;
  let previousHeight = 0;

  for (let scrolls = 0; scrolls < maxScrolls; scrolls++) {
    const currentHeight = await page.evaluate(() => document.body.scrollHeight);
    // Height unchanged since the previous scroll: nothing new was loaded.
    if (currentHeight === previousHeight) return;

    previousHeight = currentHeight;
    await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
    await page.waitForTimeout(pauseMs); // Allow content to load
  }
}
```

## Pagination

```typescript
// Click-based pagination
/**
 * Collects the results of `extractData(page)` from every page of a listing
 * that is paginated with a "Next" button.
 *
 * After each page is extracted, the button with accessible name "Next" is
 * clicked and the function waits for the network to go idle. It stops when
 * that button is disabled or no longer present.
 *
 * @param page - A page already showing the first page of results.
 * @returns All extracted items, in page order.
 */
async function scrapeAllPages(page: Page) {
  const allData = [];

  while (true) {
    const pageData = await extractData(page);
    allData.push(...pageData);

    const nextButton = page.getByRole('button', { name: 'Next' });
    // Some sites remove the button on the last page instead of disabling it.
    // Check for its absence first: isDisabled() on a locator that matches
    // nothing waits for the element and eventually times out.
    const isLastPage =
      (await nextButton.count()) === 0 || (await nextButton.isDisabled());
    if (isLastPage) break;

    await nextButton.click();
    await page.waitForLoadState('networkidle');
  }

  return allData;
}
```

## Anti-Bot Evasion

```typescript
const browser = await chromium.launch({
  headless: false, // Some sites detect headless
});

// Fingerprint settings live on the context, not on the browser.
const context = await browser.newContext({
  userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
  viewport: { width: 1920, height: 1080 },
  locale: 'en-US',
  timezoneId: 'America/New_York',
});

// Create the page from `context` so the settings above apply to it.
// browser.newPage() would open the page in a separate, default context.
const page = await context.newPage();

// Add realistic behavior: move the mouse, then pause for a random 1-3 seconds
await page.mouse.move(100, 100);
await page.waitForTimeout(Math.random() * 2000 + 1000);
```

## Session Management

```typescript
// Save: write the current context's storage state to session.json
await context.storageState({ path: 'session.json' });

// Restore: create a new context initialised from that file
const context = await browser.newContext({ storageState: 'session.json' });
```

## Error Handling

```typescript
/**
 * Opens `url` in a fresh page and returns `extractData(page)`, retrying when
 * an attempt fails.
 *
 * Each attempt uses its own page, which is closed before the next attempt
 * starts, whether the attempt succeeded or failed. Between attempts the
 * function waits 2s times the attempt number (2s, 4s, ...).
 *
 * @param url - Address to navigate to (30s navigation timeout).
 * @param retries - Total number of attempts; must be at least 1. Defaults to 3.
 * @returns Whatever `extractData` returns for the first successful attempt.
 * @throws The error of the last attempt when every attempt fails, or a
 *   `RangeError` when `retries` does not allow a single attempt.
 */
async function scrapeWithRetry(url: string, retries = 3) {
  if (!(retries >= 1)) {
    throw new RangeError(`retries must be at least 1, got ${retries}`);
  }

  let lastError: unknown;

  for (let i = 0; i < retries; i++) {
    // Declared outside `try` so `finally` can reach it; it stays undefined
    // when newPage() itself is what failed.
    let page: Awaited<ReturnType<typeof context.newPage>> | undefined;

    try {
      page = await context.newPage();
      await page.goto(url, { timeout: 30000 });
      return await extractData(page);
    } catch (error) {
      lastError = error;
    } finally {
      await page?.close();
    }

    // Back off only after the page is closed, and only if another attempt follows.
    if (i < retries - 1) {
      await new Promise(r => setTimeout(r, 2000 * (i + 1)));
    }
  }

  throw lastError;
}
```

## Rate Limiting

```typescript
/**
 * Enforces a minimum delay between consecutive requests.
 *
 * Call `await limiter.wait()` before each request. Calls are served one at a
 * time in the order they were made, so the spacing also holds when several
 * callers wait concurrently (e.g. under `Promise.all`).
 */
class RateLimiter {
  /** Timestamp (ms since epoch) at which the most recent caller was released. */
  private lastRequest = 0;

  /** Tail of the chain of pending `wait()` calls; new calls queue behind it. */
  private queue: Promise<void> = Promise.resolve();

  /**
   * @param minDelay - Minimum number of milliseconds between two releases.
   */
  constructor(private readonly minDelay: number) {}

  /**
   * Resolves once at least `minDelay` ms have passed since the previous
   * caller was released. The very first call resolves without delay.
   */
  wait(): Promise<void> {
    const turn = this.queue.then(() => this.takeTurn());
    this.queue = turn;
    return turn;
  }

  /** Sleeps for the remainder of the delay window, then records the release time. */
  private async takeTurn(): Promise<void> {
    const elapsed = Date.now() - this.lastRequest;
    if (elapsed < this.minDelay) {
      const remaining = this.minDelay - elapsed;
      await new Promise<void>(resolve => setTimeout(resolve, remaining));
    }
    this.lastRequest = Date.now();
  }
}
// Minimum spacing of 2000 ms (2s) between requests
const limiter = new RateLimiter(2000);

// One URL at a time: wait for the limiter, then scrape
for (const url of urls) {
  await limiter.wait();
  await scrape(url);
}
```

## Proxy Rotation

```typescript
// Pool of proxy servers and the position of the one to hand out next.
const proxies = ['proxy1:8080', 'proxy2:8080', 'proxy3:8080'];
let proxyIndex = 0;

/**
 * Returns the proxy at the current position and advances the position,
 * wrapping back to the first proxy after the last one (round-robin).
 */
async function getNextProxy() {
  const current = proxyIndex;
  proxyIndex = (current + 1) % proxies.length;
  return proxies[current];
}
// Take the next proxy from the pool and launch the browser through it.
const server = await getNextProxy();
const browser = await chromium.launch({ proxy: { server } });
```