Skip to content

Commit

Permalink
feat: Add usePuppeteerFetch option to QSource and DialoqbaseSettings
Browse files Browse the repository at this point in the history
  • Loading branch information
n4ze3m committed Jun 17, 2024
1 parent e92371f commit fd0a922
Show file tree
Hide file tree
Showing 14 changed files with 136 additions and 79 deletions.
7 changes: 7 additions & 0 deletions app/ui/src/routes/settings/application.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,13 @@ export default function SettingsApplicationRoot() {
>
<Switch />
</Form.Item>
<Form.Item
label="Enhanced Website loader"
name="usePuppeteerFetch"
valuePropName="checked"
>
<Switch />
</Form.Item>
</div>
<div className="bg-gray-50 border-x border-b rounded-b-md rounded-x-md px-4 py-3 text-right sm:px-6 dark:bg-[#141414] dark:border-gray-600">
<button
Expand Down
2 changes: 2 additions & 0 deletions server/prisma/migrations/20240617050823_q_12_4/migration.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
-- AlterTable
ALTER TABLE "DialoqbaseSettings" ADD COLUMN "usePuppeteerFetch" BOOLEAN DEFAULT false;
1 change: 1 addition & 0 deletions server/prisma/schema.prisma
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ model DialoqbaseSettings {
defaultChatModel String @default("gpt-3.5-turbo-dbase")
defaultEmbeddingModel String @default("dialoqbase_eb_text-embedding-ada-002")
ollamaURL String? @default("http://host.docker.internal:11434")
usePuppeteerFetch Boolean? @default(false)
}

model BotIntegration {
Expand Down
116 changes: 58 additions & 58 deletions server/prisma/seed.ts
Original file line number Diff line number Diff line change
Expand Up @@ -488,70 +488,70 @@ const removeTensorflowSupport = async () => {
});
};

const replaceOldEmbeddings = async () => {
await prisma.bot.updateMany({
where: {
embedding: "openai",
},
data: {
embedding: "dialoqbase_eb_text-embedding-ada-002",
},
});
// const replaceOldEmbeddings = async () => {
// await prisma.bot.updateMany({
// where: {
// embedding: "openai",
// },
// data: {
// embedding: "dialoqbase_eb_text-embedding-ada-002",
// },
// });

await prisma.bot.updateMany({
where: {
embedding: "cohere",
},
data: {
embedding: "dialoqbase_eb_small",
},
});
// await prisma.bot.updateMany({
// where: {
// embedding: "cohere",
// },
// data: {
// embedding: "dialoqbase_eb_small",
// },
// });

await prisma.bot.updateMany({
where: {
embedding: "transformer",
},
data: {
embedding: "dialoqbase_eb_Xenova/all-MiniLM-L6-v2",
},
});
// await prisma.bot.updateMany({
// where: {
// embedding: "transformer",
// },
// data: {
// embedding: "dialoqbase_eb_Xenova/all-MiniLM-L6-v2",
// },
// });

await prisma.bot.updateMany({
where: {
embedding: "google-gecko",
},
data: {
embedding: "dialoqbase_eb_models/embedding-gecko-001",
},
});
// await prisma.bot.updateMany({
// where: {
// embedding: "google-gecko",
// },
// data: {
// embedding: "dialoqbase_eb_models/embedding-gecko-001",
// },
// });

await prisma.bot.updateMany({
where: {
embedding: "jina-api",
},
data: {
embedding: "dialoqbase_eb_jina-embeddings-v2-base-en",
},
});
// await prisma.bot.updateMany({
// where: {
// embedding: "jina-api",
// },
// data: {
// embedding: "dialoqbase_eb_jina-embeddings-v2-base-en",
// },
// });

await prisma.bot.updateMany({
where: {
embedding: "jina",
},
data: {
embedding: "dialoqbase_eb_Xenova/jina-embeddings-v2-small-en",
},
});
// await prisma.bot.updateMany({
// where: {
// embedding: "jina",
// },
// data: {
// embedding: "dialoqbase_eb_Xenova/jina-embeddings-v2-small-en",
// },
// });

await prisma.bot.updateMany({
where: {
embedding: "google",
},
data: {
embedding: "dialoqbase_eb_embedding-001",
},
});
};
// await prisma.bot.updateMany({
// where: {
// embedding: "google",
// },
// data: {
// embedding: "dialoqbase_eb_embedding-001",
// },
// });
// };

const updateGeminiStreamingToTrue = async () => {
await prisma.dialoqbaseModels.update({
Expand Down
1 change: 1 addition & 0 deletions server/src/handlers/api/v1/admin/type.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ export type UpdateDialoqbaseSettingsRequest = {
noOfBotsPerUser: number;
allowUserToCreateBots: boolean;
allowUserToRegister: boolean;
usePuppeteerFetch: boolean;
};
};

Expand Down
19 changes: 16 additions & 3 deletions server/src/loader/web.ts
Original file line number Diff line number Diff line change
@@ -1,31 +1,44 @@
import { BaseDocumentLoader } from "langchain/document_loaders/base";
import { Document } from "langchain/document";
import { websiteParser } from "../utils/website-parser";
// import puppeteerFetch from "../utils/puppeteer-fetch";
import puppeteerFetch, { closePuppeteer } from "../utils/puppeteer-fetch";

/**
 * Constructor options for the web loader.
 */
export interface WebLoaderParams {
// Address of the page to load.
url: string;
// When truthy, the page is fetched through puppeteer instead of the plain `fetch` call.
usePuppeteerFetch?: boolean;
// When truthy, the loader leaves the puppeteer browser open after fetching
// (the crawl controller sets this and closes the browser itself once all links are processed).
doNotClosePuppeteer?: boolean;
}

export class DialoqbaseWebLoader
extends BaseDocumentLoader
implements WebLoaderParams {
url: string;
usePuppeteerFetch?: boolean;
doNotClosePuppeteer?: boolean;

constructor({ url }: WebLoaderParams) {
constructor({ url, usePuppeteerFetch, doNotClosePuppeteer }: WebLoaderParams) {
super();
this.url = url;
this.usePuppeteerFetch = usePuppeteerFetch;
this.doNotClosePuppeteer = doNotClosePuppeteer;
}

/**
 * Retrieves the HTML for `this.url`.
 *
 * When `usePuppeteerFetch` is set, the page is fetched through puppeteer with
 * readability extraction enabled; otherwise a plain `fetch` is issued and the
 * response body is returned as text.
 *
 * Unless `doNotClosePuppeteer` is set, the puppeteer browser is closed after
 * the puppeteer fetch — including when that fetch throws, so a failed page
 * load does not leave a browser running.
 *
 * @returns The fetched HTML as a string.
 */
async _fetchHTML(): Promise<string> {
  if (this.usePuppeteerFetch) {
    console.log(`[DialoqbaseWebLoader] Using puppeteer to fetch ${this.url}`)
    try {
      return await puppeteerFetch(this.url, true);
    } finally {
      // Runs on both success and failure; the caller that sets
      // doNotClosePuppeteer is responsible for closing the browser itself.
      if (!this.doNotClosePuppeteer) {
        await closePuppeteer();
      }
    }
  }
  const response = await fetch(this.url);
  return await response.text();
}

/**
 * Fetches the page, runs it through `websiteParser`, and wraps the resulting
 * text in a single `Document` whose metadata records the source URL.
 *
 * @returns A one-element array containing the parsed page.
 */
async load(): Promise<Document<Record<string, any>>[]> {
  const html = await this._fetchHTML();
  // The parsed text is intentionally not logged: it is the full page body.
  const text = websiteParser(html);
  const metadata = { source: this.url };
  return [new Document({ pageContent: text, metadata })];
}
Expand Down
7 changes: 6 additions & 1 deletion server/src/queue/controllers/crawl.controller.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,13 @@ import { PrismaClient } from "@prisma/client";
import { QSource } from "../type";
import { crawl } from "../../utils/crawl";
import { websiteQueueController } from "./website.controller";
import { closePuppeteer } from "../../utils/puppeteer-fetch";
const prisma = new PrismaClient();

export const crawlQueueController = async (source: QSource) => {
let maxDepth = source.maxDepth || 1;
let maxLinks = source.maxLinks || 1;
const data = await crawl(source.content!, maxDepth, maxLinks);
const data = await crawl(source.content!, maxDepth, maxLinks, source.usePuppeteerFetch);
const links = Array.from(data?.links || []);

for (const link of links) {
Expand All @@ -27,6 +28,8 @@ export const crawlQueueController = async (source: QSource) => {
embedding: source.embedding,
chunkOverlap: source.chunkOverlap,
chunkSize: source.chunkSize,
usePuppeteerFetch: source.usePuppeteerFetch,
doNotClosePuppeteer: true,
},
prisma
);
Expand All @@ -41,4 +44,6 @@ export const crawlQueueController = async (source: QSource) => {
},
});
}

await closePuppeteer()
};
2 changes: 2 additions & 0 deletions server/src/queue/controllers/website.controller.ts
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,8 @@ export const websiteQueueController = async (
} else {
const loader = new DialoqbaseWebLoader({
url: source.content!,
usePuppeteerFetch: source.usePuppeteerFetch,
doNotClosePuppeteer: source.doNotClosePuppeteer,
});
docs = await loader.load();
}
Expand Down
3 changes: 2 additions & 1 deletion server/src/queue/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,10 @@ export default async function queueHandler(job: SandboxedJob) {
status: "PROCESSING",
},
});
const { chunkOverlap, chunkSize } = await getRagSettings(prisma);
const { chunkOverlap, chunkSize , usePuppeteerFetch} = await getRagSettings(prisma);
source.chunkOverlap = chunkOverlap;
source.chunkSize = chunkSize;
source.usePuppeteerFetch = usePuppeteerFetch;
switch (source.type.toLowerCase()) {
case "website":
await websiteQueueController(source, prisma);
Expand Down
2 changes: 2 additions & 0 deletions server/src/queue/type.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,6 @@ export interface QSource extends BotSource {
maxLinks?: number;
chunkSize: number;
chunkOverlap: number;
usePuppeteerFetch?: boolean;
doNotClosePuppeteer?: boolean;
}
2 changes: 2 additions & 0 deletions server/src/schema/api/v1/admin/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ export const dialoqbaseSettingsSchema: FastifySchema = {
dynamicallyFetchOllamaModels: { type: "boolean" },
hideDefaultModels: { type: "boolean" },
ollamaURL: { type: "string" },
usePuppeteerFetch: { type: "boolean" },
},
},
};
Expand All @@ -47,6 +48,7 @@ export const updateDialoqbaseSettingsSchema: FastifySchema = {
defaultEmbeddingModel: { type: "string" },
hideDefaultModels: { type: "boolean" },
ollamaURL: { type: "string" },
usePuppeteerFetch: { type: "boolean" },
},
},
response: {
Expand Down
35 changes: 22 additions & 13 deletions server/src/utils/crawl.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import axios from "axios";
import { load } from "cheerio";
import { CheerioAPI, load } from "cheerio";
import puppeteerFetch from "./puppeteer-fetch";

type CrawlResult = {
links: Set<string>;
Expand All @@ -13,7 +14,8 @@ const queuedLinks: Set<string> = new Set();
export const crawl = async (
startUrl: string,
maxDepth = 2,
maxLinks = 20
maxLinks = 20,
usePuppeteerFetch = false
): Promise<CrawlResult> => {
const queue: { url: string; depth: number }[] = [{ url: startUrl, depth: 0 }];
const fetchedLinks: Set<string> = new Set();
Expand All @@ -28,20 +30,27 @@ export const crawl = async (
}

try {
const response = await axios.get(url, {
headers: {
Accept: "text/html",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
},
});

const contentType = response.headers['content-type'];
if (!contentType || !contentType.includes("text/html")) {
return;
}
let $: CheerioAPI;

if (usePuppeteerFetch) {
const response = await puppeteerFetch(url);
$ = load(response);
} else {
const response = await axios.get(url, {
headers: {
Accept: "text/html",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
},
});

const $ = load(response.data);
const contentType = response.headers['content-type'];
if (!contentType || !contentType.includes("text/html")) {
return;
}

$ = load(response.data);
}
visitedLinks.add(url);
fetchedLinks.add(url);

Expand Down
16 changes: 13 additions & 3 deletions server/src/utils/puppeteer-fetch.ts
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ let browser: Browser;


const init = async () => {
if (!browser) {
if (!browser || !browser.connected) {
browser = await puppeteer.launch({
headless: true,
args: ['--no-sandbox', '--disable-setuid-sandbox'],
Expand All @@ -52,10 +52,10 @@ const puppeteerFetch = async (url: string, useReadability = false) => {
${executor}
return executor();
}())
`) as { content?: string }
`) as { content?: string, title?: string };
if (resultArticle?.content) {
await page.close();
return resultArticle.content;
return `<!DOCTYPE html><html><head><title>${resultArticle.title}</title></head><body>${resultArticle.content}</body></html>`
}
console.error(`[puppeteerFetch] Error fetching ${url}: Readability failed`);
}
Expand All @@ -68,6 +68,16 @@ const puppeteerFetch = async (url: string, useReadability = false) => {
}
}

/**
 * Closes the shared puppeteer browser if one is currently connected.
 *
 * Safe to call when no browser has been launched yet: `browser` is only
 * assigned by `init()`, so it may still be undefined here, in which case this
 * is a no-op. Failures while closing are logged and not rethrown.
 */
export const closePuppeteer = async (): Promise<void> => {
  try {
    // Optional chaining guards the "never launched" case, which would
    // otherwise throw a TypeError and be reported as a close failure.
    if (browser?.connected) {
      await browser.close();
    }
  } catch (error) {
    // The caught value is not guaranteed to be an Error instance.
    const reason = error instanceof Error ? error.message : String(error);
    console.error(`[closePuppeteer] Error closing browser: ${reason}`);
  }
};



export default puppeteerFetch;
2 changes: 2 additions & 0 deletions server/src/utils/rag-settings.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,13 @@ export const getRagSettings = async (prisma: PrismaClient) => {
select: {
defaultChunkSize: true,
defaultChunkOverlap: true,
usePuppeteerFetch: true,
},
});

return {
chunkSize: data?.defaultChunkSize || 1000,
chunkOverlap: data?.defaultChunkOverlap || 200,
usePuppeteerFetch: data?.usePuppeteerFetch
};
};

0 comments on commit fd0a922

Please sign in to comment.