-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraper.js
84 lines (70 loc) · 2.43 KB
/
scraper.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
const axios = require('axios');
const puppeteer = require('puppeteer-extra');
const StealthPlugin = require('puppeteer-extra-plugin-stealth');
const { getRandomUserAgent, getRandomProxy,getRandomDelay } = require('./utils.js');
const { parseHtml } = require('./parser.js');
// Register the stealth plugin on the shared puppeteer-extra instance at module
// load time, so it is already in place for every later `puppeteer.launch()`.
puppeteer.use(StealthPlugin());
/**
 * Fetch a URL with one Axios GET request and hand back the response body.
 *
 * The request carries a User-Agent chosen by `getRandomUserAgent()`, fixed
 * `Accept` / `Accept-Language` headers, a proxy chosen by
 * `getRandomProxy(proxyList)`, and a 30 000 ms timeout.
 *
 * @param {string} url - Address to request.
 * @param {*} proxyList - Passed through unchanged to `getRandomProxy`.
 * @returns {Promise<*>} The `data` property of the Axios response.
 * @throws Rejects with whatever `axios.get` rejects with; nothing is caught here.
 */
async function scrapeWithAxios(url, proxyList) {
  // Headers are assembled first so the user agent is picked before the proxy.
  const headers = {
    'User-Agent': getRandomUserAgent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.9',
  };
  const proxy = getRandomProxy(proxyList);

  const { data } = await axios.get(url, { headers, proxy, timeout: 30000 });
  return data;
}
/**
 * Load a URL in a headless browser, scroll through the page, and return the
 * resulting HTML.
 *
 * The browser is always closed before this function settles, including when
 * page creation, navigation, scrolling, or serialization fails.
 *
 * @param {string} url - Address to open.
 * @param {*} proxyList - Accepted for signature parity with `scrapeWithAxios`;
 *   this function does not read it.
 * @returns {Promise<string>} Whatever `page.content()` resolves to after scrolling.
 * @throws Rejects with the first error raised by any Puppeteer call.
 */
async function scrapeWithPuppeteer(url, proxyList) {
  const browser = await puppeteer.launch({
    args: ['--no-sandbox', '--disable-setuid-sandbox'],
    headless: true,
  });

  // Everything after launch runs under try/finally so a failed step cannot
  // leave the launched browser running.
  try {
    const page = await browser.newPage();
    await page.setUserAgent(getRandomUserAgent());
    await page.setViewport({ width: 1920, height: 1080 });
    await page.goto(url, { waitUntil: 'networkidle2' });

    // Simulate scrolling: inside the page, move down 100px every 100ms and
    // stop once the distance scrolled reaches the body's current scrollHeight.
    // scrollHeight is re-read on every tick, so content that grows while
    // scrolling extends the run.
    await page.evaluate(async () => {
      await new Promise((resolve) => {
        let totalHeight = 0;
        const distance = 100;
        const timer = setInterval(() => {
          const scrollHeight = document.body.scrollHeight;
          window.scrollBy(0, distance);
          totalHeight += distance;
          if (totalHeight >= scrollHeight) {
            clearInterval(timer);
            resolve();
          }
        }, 100);
      });
    });

    return await page.content();
  } finally {
    await browser.close();
  }
}
/**
 * Scrape a URL and return the parsed result.
 *
 * Steps: wait a random interval from `getRandomDelay(1000, 5000)`, try
 * `scrapeWithAxios`, fall back to `scrapeWithPuppeteer` if that rejects, then
 * pass the HTML to `parseHtml`.
 *
 * @param {string} url - Address to scrape.
 * @param {*} proxyList - Forwarded unchanged to both scraping strategies.
 * @returns {Promise<*>} Whatever `parseHtml` resolves to.
 * @throws Rethrows, after logging, any error from the Puppeteer fallback or
 *   from `parseHtml`. An Axios failure alone does not reject.
 */
async function advancedEventScraper(url, proxyList) {
  try {
    // Random delay before starting
    await new Promise((resolve) => setTimeout(resolve, getRandomDelay(1000, 5000)));

    let html;
    try {
      // First attempt with Axios
      html = await scrapeWithAxios(url, proxyList);
    } catch (axiosError) {
      // Keep the reason for the Axios failure in the log; otherwise the
      // fallback hides why the first attempt did not work.
      const reason = axiosError instanceof Error ? axiosError.message : axiosError;
      console.log('Axios scraping failed, falling back to Puppeteer:', reason);
      // If Axios fails, fall back to Puppeteer
      html = await scrapeWithPuppeteer(url, proxyList);
    }

    // Parse the HTML
    return await parseHtml(html);
  } catch (error) {
    console.error('Error in advanced event scraper:', error);
    throw error;
  }
}
// Public surface of this module: only the combined scraper is exported;
// scrapeWithAxios and scrapeWithPuppeteer are not.
module.exports = { advancedEventScraper };