diff --git a/Dockerfile b/Dockerfile index 075a9cab..150078cd 100644 --- a/Dockerfile +++ b/Dockerfile @@ -31,6 +31,14 @@ RUN yarn config set registry https://registry.npmjs.org/ RUN yarn config set network-timeout 1200000 RUN apt update && apt -y install --no-install-recommends ca-certificates git git-lfs openssh-client curl jq cmake sqlite3 openssl psmisc python3 + +# Install Google Chrome for Puppeteer (see PUPPETEER_EXECUTABLE_PATH). The apt lists are deliberately kept: the next RUN still installs packages, and the later apt-get clean RUN does the cleanup. +RUN apt-get update && apt-get install gnupg wget -y --no-install-recommends && \ + wget --quiet --output-document=- https://dl-ssl.google.com/linux/linux_signing_key.pub | gpg --dearmor > /etc/apt/trusted.gpg.d/google-archive.gpg && \ + sh -c 'echo "deb [arch=amd64] https://dl.google.com/linux/chrome/deb/ stable main" > /etc/apt/sources.list.d/google.list' && \ + apt-get update && \ + apt-get install google-chrome-stable -y --no-install-recommends + RUN apt -y install g++ make # RUN npm install -g node-gyp RUN apt-get clean autoclean && apt-get autoremove --yes && rm -rf /var/lib/{apt,dpkg,cache,log}/ @@ -51,4 +59,6 @@ RUN yarn install --production --frozen-lockfile ENV NODE_ENV=production +ENV PUPPETEER_EXECUTABLE_PATH=/usr/bin/google-chrome-stable + CMD ["yarn", "start"] \ No newline at end of file diff --git a/server/src/utils/crawl.ts b/server/src/utils/crawl.ts index ee131263..4f2508b5 100644 --- a/server/src/utils/crawl.ts +++ b/server/src/utils/crawl.ts @@ -20,7 +20,7 @@ export const crawl = async ( while (queue.length > 0 && visitedLinks.size < maxLinks) { const batch = queue.splice(0, Math.min(queue.length, maxLinks - visitedLinks.size)); - + await Promise.all( batch.map(async ({ url, depth }) => { if (visitedLinks.has(url) || depth > maxDepth) { @@ -29,7 +29,10 @@ export const crawl = async ( try { const response = await axios.get(url, { - headers: { Accept: "text/html" }, + headers: { + Accept: "text/html", + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36", + }, }); const contentType = 
response.headers['content-type'];