-
-
Notifications
You must be signed in to change notification settings - Fork 266
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: add internet search integration
- Loading branch information
Showing
9 changed files
with
290 additions
and
12 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
#https://docs.searxng.org/admin/searx.limiter.html |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
use_default_settings: true | ||
|
||
search: | ||
# Filter results. 0: None, 1: Moderate, 2: Strict | ||
safe_search: 0 | ||
# Existing autocomplete backends: "dbpedia", "duckduckgo", "google", "yandex", "mwmbl", | ||
# "seznam", "startpage", "stract", "swisscows", "qwant", "wikipedia" - leave blank to turn it off | ||
# by default. | ||
autocomplete: 'google' | ||
# minimum characters to type before autocompleter starts | ||
autocomplete_min: 4 | ||
# Default search language - leave blank to detect from browser information or | ||
# use codes from 'languages.py' | ||
default_lang: 'auto' | ||
# max_page: 0 # if engine supports paging, 0 means unlimited numbers of pages | ||
# Available languages | ||
# languages: | ||
# - all | ||
# - en | ||
# - en-US | ||
# - de | ||
# - it-IT | ||
# - fr | ||
# - fr-BE | ||
# ban time in seconds after engine errors | ||
ban_time_on_fail: 5 | ||
# max ban time in seconds after engine errors | ||
max_ban_time_on_fail: 120 | ||
suspended_times: | ||
# Engine suspension time after error (in seconds; set to 0 to disable) | ||
# For error "Access denied" and "HTTP error [402, 403]" | ||
SearxEngineAccessDenied: 86400 | ||
# For error "CAPTCHA" | ||
SearxEngineCaptcha: 86400 | ||
# For error "Too many requests" and "HTTP error 429" | ||
SearxEngineTooManyRequests: 3600 | ||
# Cloudflare CAPTCHA | ||
cf_SearxEngineCaptcha: 1296000 | ||
cf_SearxEngineAccessDenied: 86400 | ||
# ReCAPTCHA | ||
recaptcha_SearxEngineCaptcha: 604800 | ||
|
||
# remove format to deny access, use lower case. | ||
# formats: [html, csv, json, rss] | ||
formats: | ||
- html | ||
- json | ||
|
||
server: | ||
# Is overwritten by ${SEARXNG_PORT} and ${SEARXNG_BIND_ADDRESS} | ||
port: 8888 | ||
bind_address: '0.0.0.0' | ||
# public URL of the instance, to ensure correct inbound links. Is overwritten | ||
# by ${SEARXNG_URL}. | ||
base_url: false # "http://example.com/location" | ||
# rate limit the number of requests on the instance, block some bots. | ||
# Is overwritten by ${SEARXNG_LIMITER} | ||
limiter: false | ||
# enable features designed only for public instances. | ||
# Is overwritten by ${SEARXNG_PUBLIC_INSTANCE} | ||
public_instance: false | ||
|
||
# If your instance owns a /etc/searxng/settings.yml file, then set the following | ||
# values there. | ||
|
||
secret_key: 'KDzXs0qvZdoZnzW7Eq4jhubjgTWayRM' # Is overwritten by ${SEARXNG_SECRET} | ||
# Proxy image results through SearXNG. Is overwritten by ${SEARXNG_IMAGE_PROXY} | ||
image_proxy: false | ||
# 1.0 and 1.1 are supported | ||
http_protocol_version: '1.0' | ||
# POST queries are more secure as they don't show up in history but may cause | ||
# problems when using Firefox containers | ||
method: 'POST' | ||
default_http_headers: | ||
X-Content-Type-Options: nosniff | ||
X-Download-Options: noopen | ||
X-Robots-Tag: noindex, nofollow | ||
Referrer-Policy: no-referrer |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
-- AlterTable: add a required boolean "isSuspended" column to "User"; DEFAULT false supplies the value for rows that already exist.
ALTER TABLE "User" ADD COLUMN "isSuspended" BOOLEAN NOT NULL DEFAULT false; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,159 @@ | ||
import * as cheerio from "cheerio"; | ||
import { Embeddings } from "@langchain/core/embeddings"; | ||
import { Document } from "@langchain/core/documents"; | ||
import * as ml_distance from "ml-distance" | ||
|
||
/**
 * Parses a result-count limit from an environment-style string.
 *
 * @param raw - Raw value, typically read from `process.env`.
 * @param fallback - Value used when `raw` is missing, non-numeric, or negative.
 * @returns The parsed non-negative integer, or `fallback`.
 */
const parseResultsLimit = (raw: string | undefined, fallback = 5): number => {
  // Explicit radix; an unparsable value gives NaN, which would make
  // `slice(0, NaN)` silently return no results at all.
  const parsed = Number.parseInt(raw ?? "", 10);
  return Number.isNaN(parsed) || parsed < 0 ? fallback : parsed;
};

/**
 * Key into the provider table, read from `DB_SEARCH_PROVIDER`.
 * `||` (not `??`) is deliberate: an empty variable also selects "default".
 */
const SERACH_PROVIDER = process.env.DB_SEARCH_PROVIDER || "default";

/** Maximum number of raw search hits kept, read from `DB_TOTAL_RESULTS_LIMIT` (default 5). */
const TOTAL_RESULTS_LIMIT = parseResultsLimit(process.env.DB_TOTAL_RESULTS_LIMIT);
|
||
/**
 * Searches DuckDuckGo by scraping its HTML-only endpoint.
 *
 * The request is aborted after 10 seconds. Network and abort errors are
 * propagated to the caller as a rejected promise.
 *
 * @param query - Raw (un-encoded) search text; it is URL-encoded here.
 * @returns One `{ title, link, content }` entry per result that has a link.
 */
export const duckduckgoSearchUnOffical = async (query: string) => {
  const abortController = new AbortController();
  const timeout = setTimeout(() => abortController.abort(), 10000);

  let htmlString: string;
  try {
    const response = await fetch(
      // Encode the query so characters such as `&`, `#` or `+` stay part of it.
      "https://html.duckduckgo.com/html/?q=" + encodeURIComponent(query),
      { signal: abortController.signal }
    );
    htmlString = await response.text();
  } finally {
    // Do not leave the abort timer pending once the request has settled.
    clearTimeout(timeout);
  }

  const $ = cheerio.load(htmlString);

  // A malformed percent-escape must not discard every other result.
  const decodeLink = (encoded: string): string => {
    try {
      return decodeURIComponent(encoded);
    } catch {
      return encoded;
    }
  };

  return Array.from($("div.results_links_deep")).flatMap((result) => {
    const snippet = $(result).find("a.result__snippet");
    const href = snippet.attr("href");
    if (!href) {
      // No href on the snippet anchor: nothing to link to, skip the entry.
      return [];
    }

    // Strip the redirect wrapper and its trailing `rut` parameter.
    const link = href
      .replace("//duckduckgo.com/l/?uddg=", "")
      .replace(/&rut=.*/, "");

    return [
      {
        title: $(result).find("a.result__a").text(),
        link: decodeLink(link),
        content: snippet.text(),
      },
    ];
  });
};
|
||
/**
 * Searches Google by scraping the HTML results page.
 *
 * The request is aborted after 10 seconds. Network and abort errors are
 * propagated to the caller as a rejected promise.
 *
 * @param query - Raw (un-encoded) search text; it is URL-encoded here.
 * @returns One `{ title, link, content }` entry per `div.g` element; `link`
 *          is `undefined` when the first anchor has no `href`.
 */
export const googleSearchUnOffical = async (query: string) => {
  const abortController = new AbortController();
  const timeout = setTimeout(() => abortController.abort(), 10000);

  let htmlString: string;
  try {
    const response = await fetch(
      // Encode the query so characters such as `&`, `#` or `+` stay part of it.
      "https://www.google.com/search?hl=en&q=" + encodeURIComponent(query),
      { signal: abortController.signal }
    );
    htmlString = await response.text();
  } finally {
    // Do not leave the abort timer pending once the request has settled.
    clearTimeout(timeout);
  }

  const $ = cheerio.load(htmlString);

  return $("div.g")
    .map((_, result) => {
      const node = $(result);
      // The snippet text is the concatenation of every <span> in the result.
      const content = node
        .find("span")
        .map((__, span) => $(span).text())
        .get()
        .join(" ");

      return {
        title: node.find("h3").text(),
        link: node.find("a").attr("href"),
        content,
      };
    })
    .get();
};
|
||
/**
 * Queries a SearXNG instance through its JSON search endpoint.
 *
 * The instance base URL is read from `DB_SEARXNG_URL`. The request is aborted
 * after 10 seconds.
 *
 * @param query - Search text; encoded via `URLSearchParams`.
 * @returns One `{ title, link, content }` entry per result in the response.
 * @throws Error when `DB_SEARXNG_URL` is unset or the response status is not OK.
 */
export const searxngSearch = async (query: string) => {
  const searxngUrl = process.env.DB_SEARXNG_URL;
  if (!searxngUrl) {
    throw new Error("DB_SEARXNG_URL is not set");
  }

  // Drop trailing slashes so "http://host/" does not become "http://host//search".
  const url = new URL(`${searxngUrl.replace(/\/+$/, "")}/search`);
  url.searchParams.append("q", query);
  url.searchParams.append("format", "json");

  const abortController = new AbortController();
  const timeout = setTimeout(() => abortController.abort(), 10000);

  try {
    const response = await fetch(url.toString(), {
      method: "GET",
      headers: {
        Accept: "application/json",
      },
      // Without the signal the abort timer above has no effect on the request.
      signal: abortController.signal,
    });

    if (!response.ok) {
      // The error body may not be JSON; read it as text so logging cannot throw
      // and does not print "[object Object]".
      const body = await response.text().catch(() => "");
      console.error(`Error: ${body}`);
      throw new Error(`Error: ${response.status}`);
    }

    const data = (await response.json()) as {
      results?: {
        title: string;
        url: string;
        content: string;
      }[];
    };

    // Tolerate a payload without a `results` array instead of crashing on `.map`.
    const results = Array.isArray(data.results) ? data.results : [];

    return results.map((result) => ({
      title: result.title,
      link: result.url,
      content: result.content,
    }));
  } finally {
    // Do not leave the abort timer pending once the request has settled.
    clearTimeout(timeout);
  }
};
|
||
/** Signature shared by every search backend in the provider table. */
type SearchProvider = (
  query: string
) => Promise<{ title: string; link?: string; content: string }[]>;

/**
 * Search backends keyed by the name accepted in `DB_SEARCH_PROVIDER`.
 * Typed with a string index so a provider name taken from the environment
 * can be looked up without an implicit `any`.
 */
const searchProviders: Record<string, SearchProvider> = {
  duckduckgo: duckduckgoSearchUnOffical,
  google: googleSearchUnOffical,
  searxng: searxngSearch,
  // "default" uses SearXNG unless IS_RAILWAY is exactly "true",
  // in which case it uses the DuckDuckGo scraper.
  default:
    process.env.IS_RAILWAY !== "true"
      ? searxngSearch
      : duckduckgoSearchUnOffical,
};
|
||
/**
 * Runs an internet search and ranks the hits against the query by embedding
 * similarity.
 *
 * The provider selected by `SERACH_PROVIDER` is called, the first
 * `TOTAL_RESULTS_LIMIT` hits are embedded together with the query, and hits
 * whose cosine similarity to the query exceeds 0.5 are returned, best first
 * (at most 15).
 *
 * @param embedding - Embeddings implementation used for both the hits and the query.
 * @param query - Search text, passed unchanged to the provider and the embedder.
 * @returns `[document, similarity]` pairs sorted by descending similarity;
 *          empty when the provider returns no hits.
 * @throws Error when `SERACH_PROVIDER` does not name a known provider.
 */
export const searchInternet = async (
  embedding: Embeddings,
  { query }: { query: string }
): Promise<[Document, number][]> => {
  // Match own keys only, so a name such as "toString" cannot resolve to
  // something inherited from Object.prototype.
  const searchProvider = Object.entries(searchProviders).find(
    ([name]) => name === SERACH_PROVIDER
  )?.[1];
  if (!searchProvider) {
    throw new Error(`Search provider ${SERACH_PROVIDER} not found`);
  }

  const hits = await searchProvider(query);
  const results = hits.slice(0, TOTAL_RESULTS_LIMIT);

  if (results.length === 0) {
    // Nothing to rank; skip the embedding calls entirely.
    return [];
  }

  const [docEmbeddings, queryEmbedding] = await Promise.all([
    embedding.embedDocuments(results.map((doc) => doc.content)),
    embedding.embedQuery(query),
  ]);

  return docEmbeddings
    .map((docEmbedding, index) => ({
      index,
      similarity: ml_distance.similarity.cosine(queryEmbedding, docEmbedding),
    }))
    .filter((scored) => scored.similarity > 0.5)
    .sort((a, b) => b.similarity - a.similarity)
    .slice(0, 15)
    .map(({ index, similarity }): [Document, number] => {
      const doc: Document = {
        pageContent: results[index]?.content || "",
        metadata: {
          source: results[index]?.link || "",
        },
      };
      return [doc, similarity];
    });
};
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters