From d0532ea8d31198479fd0344086ed4137d1492875 Mon Sep 17 00:00:00 2001 From: deedy5 <65482418+deedy5@users.noreply.github.com> Date: Sat, 21 Dec 2024 23:41:07 +0300 Subject: [PATCH 1/7] DDGS.images: remove multithreading --- duckduckgo_search/duckduckgo_search.py | 27 ++++++++++---------------- 1 file changed, 10 insertions(+), 17 deletions(-) diff --git a/duckduckgo_search/duckduckgo_search.py b/duckduckgo_search/duckduckgo_search.py index bb38bfd..d6f8a5e 100644 --- a/duckduckgo_search/duckduckgo_search.py +++ b/duckduckgo_search/duckduckgo_search.py @@ -516,13 +516,11 @@ def images( cache = set() results: list[dict[str, str]] = [] - def _images_page(s: int) -> list[dict[str, str]]: - payload["s"] = f"{s}" + for _ in range(5): resp_content = self._get_url("GET", "https://duckduckgo.com/i.js", params=payload) resp_json = json_loads(resp_content) - page_data = resp_json.get("results", []) - page_results = [] + for row in page_data: image_url = row.get("image") if image_url and image_url not in cache: @@ -536,20 +534,15 @@ def _images_page(s: int) -> list[dict[str, str]]: "width": row["width"], "source": row["source"], } - page_results.append(result) - return page_results - - slist = [0] - if max_results: - max_results = min(max_results, 500) - slist.extend(range(100, max_results, 100)) - try: - for r in self._executor.map(_images_page, slist): - results.extend(r) - except Exception as e: - raise e + results.append(result) + if max_results and len(results) >= max_results: + return results + next = resp_json.get("next") + if next is None: + return results + payload["s"] = next.split("s=")[-1].split("&")[0] - return list(islice(results, max_results)) + return results def videos( self, From c157aa313680cd5c3c52ae949d92f827a520902a Mon Sep 17 00:00:00 2001 From: deedy5 <65482418+deedy5@users.noreply.github.com> Date: Sat, 21 Dec 2024 23:48:25 +0300 Subject: [PATCH 2/7] DDGS.videos: remove multithreading --- duckduckgo_search/duckduckgo_search.py | 27 ++++++++++---------------- 1 file changed, 10 insertions(+), 17 deletions(-) diff --git a/duckduckgo_search/duckduckgo_search.py b/duckduckgo_search/duckduckgo_search.py index d6f8a5e..ed7da73 100644 --- a/duckduckgo_search/duckduckgo_search.py +++ b/duckduckgo_search/duckduckgo_search.py @@ -596,30 +596,23 @@ def videos( cache = set() results: list[dict[str, str]] = [] - def _videos_page(s: int) -> list[dict[str, str]]: - payload["s"] = f"{s}" + for _ in range(8): resp_content = self._get_url("GET", "https://duckduckgo.com/v.js", params=payload) resp_json = json_loads(resp_content) - page_data = resp_json.get("results", []) - page_results = [] + for row in page_data: if row["content"] not in cache: cache.add(row["content"]) - page_results.append(row) - return page_results - - slist = [0] - if max_results: - max_results = min(max_results, 200) - slist.extend(range(60, max_results, 60)) - try: - for r in self._executor.map(_videos_page, slist): - results.extend(r) - except Exception as e: - raise e + results.append(row) + if max_results and len(results) >= max_results: + return results + next = resp_json.get("next") + if next is None: + return results + payload["s"] = next.split("s=")[-1].split("&")[0] - return list(islice(results, max_results)) + return results def news( self, From f695a21406c7d7fc507eb329abfa35b572c6a06c Mon Sep 17 00:00:00 2001 From: deedy5 <65482418+deedy5@users.noreply.github.com> Date: Sat, 21 Dec 2024 23:52:29 +0300 Subject: [PATCH 3/7] DDGS.news: remove multithreading --- duckduckgo_search/duckduckgo_search.py | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/duckduckgo_search/duckduckgo_search.py b/duckduckgo_search/duckduckgo_search.py index ed7da73..a70b259 100644 --- a/duckduckgo_search/duckduckgo_search.py +++ b/duckduckgo_search/duckduckgo_search.py @@ -658,12 +658,11 @@ def news( cache = set() results: list[dict[str, str]] = [] - def _news_page(s: int) -> list[dict[str, str]]: - payload["s"] = f"{s}" + for _ in range(5): resp_content = self._get_url("GET", "https://duckduckgo.com/news.js", params=payload) resp_json = json_loads(resp_content) page_data = resp_json.get("results", []) - page_results = [] + for row in page_data: if row["url"] not in cache: cache.add(row["url"]) @@ -676,17 +675,13 @@ def _news_page(s: int) -> list[dict[str, str]]: "image": _normalize_url(image_url), "source": row["source"], } - page_results.append(result) - return page_results + results.append(result) + if max_results and len(results) >= max_results: + return results - slist = [0] - if max_results: - max_results = min(max_results, 120) - slist.extend(range(30, max_results, 30)) - try: - for r in self._executor.map(_news_page, slist): - results.extend(r) - except Exception as e: - raise e + next = resp_json.get("next") + if next is None: + return results + payload["s"] = next.split("s=")[-1].split("&")[0] - return list(islice(results, max_results)) + return results From ce47395ca50623c0104ebff72945560cf5e03e97 Mon Sep 17 00:00:00 2001 From: deedy5 <65482418+deedy5@users.noreply.github.com> Date: Sun, 22 Dec 2024 00:39:37 +0300 Subject: [PATCH 4/7] Remove dead code --- duckduckgo_search/duckduckgo_search.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/duckduckgo_search/duckduckgo_search.py b/duckduckgo_search/duckduckgo_search.py index a70b259..b6d09f0 100644 --- a/duckduckgo_search/duckduckgo_search.py +++ b/duckduckgo_search/duckduckgo_search.py @@ -3,10 +3,9 @@ import logging import os import warnings -from concurrent.futures import ThreadPoolExecutor from datetime import datetime, timezone from functools import cached_property -from itertools import cycle, islice +from itertools import cycle from random import choice from time import sleep, time from types import TracebackType @@ -39,7 +38,6 @@ class DDGS: """DuckDuckgo_search class to get search results from duckduckgo.com.""" - _executor: ThreadPoolExecutor = ThreadPoolExecutor() _impersonates = ( "chrome_100", "chrome_101", "chrome_104", "chrome_105", "chrome_106", "chrome_107", "chrome_108", "chrome_109", "chrome_114", "chrome_116", "chrome_117", "chrome_118", "chrome_119", "chrome_120", From 829d2129152566e130a7f9a2736a9f9d605f810a Mon Sep 17 00:00:00 2001 From: deedy5 <65482418+deedy5@users.noreply.github.com> Date: Sun, 22 Dec 2024 11:05:44 +0300 Subject: [PATCH 5/7] DDGS.text: add `auto` backend; add lxml to dependencies --- README.md | 8 ++--- duckduckgo_search/cli.py | 2 +- duckduckgo_search/duckduckgo_search.py | 49 ++++++++++++++------------ pyproject.toml | 4 +-- 4 files changed, 31 insertions(+), 32 deletions(-) diff --git a/README.md b/README.md index 2b88e44..6a241dc 100755 --- a/README.md +++ b/README.md @@ -22,9 +22,6 @@ AI chat and search for text, news, images and videos using the DuckDuckGo.com se ```python pip install -U duckduckgo_search ``` -> [!NOTE] -> you can install lxml to use the `text` function with `backend='html'` or `backend='lite'` (size ≈ 12Mb)
-> `pip install -U duckduckgo_search[lxml]` ## CLI version @@ -235,7 +232,7 @@ def text( region: str = "wt-wt", safesearch: str = "moderate", timelimit: str | None = None, - backend: str = "api", + backend: str = "auto", max_results: int | None = None, ) -> list[dict[str, str]]: """DuckDuckGo text search generator. Query params: https://duckduckgo.com/params. @@ -245,7 +242,8 @@ def text( region: wt-wt, us-en, uk-en, ru-ru, etc. Defaults to "wt-wt". safesearch: on, moderate, off. Defaults to "moderate". timelimit: d, w, m, y. Defaults to None. - backend: api, html, lite. Defaults to api. + backend: auto, api, html, lite. Defaults to auto. + auto - try all backends in random order, api - collect data from https://duckduckgo.com, html - collect data from https://html.duckduckgo.com, lite - collect data from https://lite.duckduckgo.com. diff --git a/duckduckgo_search/cli.py b/duckduckgo_search/cli.py index f6b1326..73fc3ed 100644 --- a/duckduckgo_search/cli.py +++ b/duckduckgo_search/cli.py @@ -197,7 +197,7 @@ def chat(load, proxy, multiline, timeout, verify, model): @click.option("-o", "--output", help="csv, json or filename.csv|json (save the results to a csv or json file)") @click.option("-d", "--download", is_flag=True, default=False, help="download results. -dd to set custom directory") @click.option("-dd", "--download-directory", help="Specify custom download directory") -@click.option("-b", "--backend", default="api", type=click.Choice(["api", "html", "lite"]), help="which backend to use") +@click.option("-b", "--backend", default="auto", type=click.Choice(["auto", "api", "html", "lite"])) @click.option("-th", "--threads", default=10, help="download threads, default=10") @click.option("-p", "--proxy", help="the proxy to send requests, example: socks5://127.0.0.1:9150") @click.option("-v", "--verify", default=True, help="verify SSL when making the request") diff --git a/duckduckgo_search/duckduckgo_search.py b/duckduckgo_search/duckduckgo_search.py index b6d09f0..9242fe8 100644 --- a/duckduckgo_search/duckduckgo_search.py +++ b/duckduckgo_search/duckduckgo_search.py @@ -6,21 +6,15 @@ from datetime import datetime, timezone from functools import cached_property from itertools import cycle -from random import choice +from random import choice, shuffle from time import sleep, time from types import TracebackType from typing import cast import primp # type: ignore - -try: - from lxml.etree import _Element - from lxml.html import HTMLParser as LHTMLParser - from lxml.html import document_fromstring - - LXML_AVAILABLE = True -except ImportError: - LXML_AVAILABLE = False +from lxml.etree import _Element +from lxml.html import HTMLParser as LHTMLParser +from lxml.html import document_fromstring from .exceptions import ConversationLimitException, DuckDuckGoSearchException, RatelimitException, TimeoutException from .utils import ( @@ -213,7 +207,7 @@ def text( region: str = "wt-wt", safesearch: str = "moderate", timelimit: str | None = None, - backend: str = "api", + backend: str = "auto", max_results: int | None = None, ) -> list[dict[str, str]]: """DuckDuckGo text search. Query params: https://duckduckgo.com/params. @@ -223,7 +217,8 @@ def text( region: wt-wt, us-en, uk-en, ru-ru, etc. Defaults to "wt-wt". safesearch: on, moderate, off. Defaults to "moderate". timelimit: d, w, m, y. Defaults to None. - backend: api, html, lite. Defaults to api. + backend: auto, api, html, lite. Defaults to auto. + auto - try all backends in random order, api - collect data from https://duckduckgo.com, html - collect data from https://html.duckduckgo.com, lite - collect data from https://lite.duckduckgo.com. @@ -237,17 +232,25 @@ def text( RatelimitException: Inherits from DuckDuckGoSearchException, raised for exceeding API request rate limits. TimeoutException: Inherits from DuckDuckGoSearchException, raised for API request timeouts. """ - if LXML_AVAILABLE is False and backend != "api": - backend = "api" - warnings.warn("lxml is not installed. Using backend='api'.", stacklevel=2) - - if backend == "api": - results = self._text_api(keywords, region, safesearch, timelimit, max_results) - elif backend == "html": - results = self._text_html(keywords, region, timelimit, max_results) - elif backend == "lite": - results = self._text_lite(keywords, region, timelimit, max_results) - return results + + backends = ["api", "html", "lite"] if backend == "auto" else [backend] + shuffle(backends) + + results, err = [], None + for b in backends: + try: + if b == "api": + results = self._text_api(keywords, region, safesearch, timelimit, max_results) + elif b == "html": + results = self._text_html(keywords, region, timelimit, max_results) + elif b == "lite": + results = self._text_lite(keywords, region, timelimit, max_results) + return results + except Exception as ex: + logger.info(f"Error to search using {b} backend: {ex}") + err = ex + + raise DuckDuckGoSearchException(err) def _text_api( self, diff --git a/pyproject.toml b/pyproject.toml index 262f180..b6ab9b7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,6 +31,7 @@ classifiers = [ dependencies = [ "click>=8.1.7", "primp>=0.9.1", + "lxml>=5.3.0", ] dynamic = ["version"] @@ -44,9 +45,6 @@ ddgs = "duckduckgo_search.cli:safe_entry_point" version = {attr = "duckduckgo_search.version.__version__"} [project.optional-dependencies] -lxml = [ - "lxml>=5.3.0", -] dev = [ "mypy>=1.13.0", "pytest>=8.3.4", From ee1dce3cb4d1101b4109cdae28bc0842dda66cbb Mon Sep 17 00:00:00 2001 From: deedy5 <65482418+deedy5@users.noreply.github.com> Date: Sun, 22 Dec 2024 11:07:15 +0300 Subject: [PATCH 6/7] tests: update --- pyproject.toml | 1 + tests/test_cli.py | 66 ++++++++++++++++++++++++++--------------------- 2 files changed, 38 insertions(+), 29 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index b6ab9b7..6d86859 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,6 +48,7 @@ version = {attr = "duckduckgo_search.version.__version__"} dev = [ "mypy>=1.13.0", "pytest>=8.3.4", + "pytest-dependency>=0.6.0", "ruff>=0.8.3", ] diff --git a/tests/test_cli.py b/tests/test_cli.py index b072562..2b88e29 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1,4 +1,5 @@ import os +import pathlib import shutil import time @@ -9,7 +10,8 @@ from duckduckgo_search.cli import _download_results, _save_csv, _save_json, cli runner = CliRunner() - +TEXT_RESULTS = None +IMAGES_RESULTS = None @pytest.fixture(autouse=True) def pause_between_tests(): @@ -46,43 +48,49 @@ def test_videos_command(): assert "title" in result.output -def test_save_csv(tmp_path): - keywords = "cat" - with DDGS() as ddgs: - results = ddgs.text(keywords, max_results=10) - assert 5 <= len(results) <= 10 +@pytest.mark.dependency() +def test_get_text(): + global TEXT_RESULTS + TEXT_RESULTS = DDGS().text("test") + assert TEXT_RESULTS + + +@pytest.mark.dependency() +def test_get_images(): + global IMAGES_RESULTS + IMAGES_RESULTS = DDGS().images("test") + assert IMAGES_RESULTS + - temp_file = tmp_path / f"{keywords}.csv" - _save_csv(temp_file, results) +@pytest.mark.dependency(depends=["test_get_data"]) +def test_save_csv(tmp_path): + temp_file = tmp_path / "test_csv.csv" + _save_csv(temp_file, RESULTS) assert temp_file.exists() +@pytest.mark.dependency(depends=["test_get_data"]) def test_save_json(tmp_path): - keywords = "dog" - with DDGS() as ddgs: - results = ddgs.text(keywords, max_results=10) - assert 5 <= len(results) <= 10 - - temp_file = tmp_path / f"{keywords}.json" - _save_json(temp_file, results) + temp_file = tmp_path / "test_json.json" + _save_json(temp_file, RESULTS) assert temp_file.exists() +@pytest.mark.dependency(depends=["test_get_data"]) def test_text_download(): - keywords = "sea" - with DDGS() as ddgs: - results = ddgs.text(keywords, max_results=8) - assert 5 <= len(results) <= 8 - - _download_results(keywords, results, function_name="text", pathname="text_downloads") - shutil.rmtree("text_downloads") + pathname = pathlib.Path("text_downloads") + _download_results(test_text_download, TEXT_RESULTS, function_name="text", pathname=str(pathname)) + assert pathname.is_dir() and pathname.iterdir() + for file in pathname.iterdir(): + assert file.is_file() + shutil.rmtree(str(pathname)) +@pytest.mark.dependency(depends=["test_get_images"]) def test_images_download(): - keywords = "sky" - with DDGS() as ddgs: - results = ddgs.images(keywords, max_results=8) - assert len(results) >= 8 - - _download_results(keywords, results, function_name="images", pathname="images_downloads") - shutil.rmtree("images_downloads") + pathname = pathlib.Path("images_downloads") + _download_results(test_images_download, IMAGES_RESULTS, function_name="images", pathname=str(pathname)) + assert pathname.is_dir() and pathname.iterdir() + for file in pathname.iterdir(): + assert file.is_file() + shutil.rmtree(str(pathname)) From da91fff5bb3e81031623109d92f8d608fdc9bb85 Mon Sep 17 00:00:00 2001 From: deedy5 <65482418+deedy5@users.noreply.github.com> Date: Sun, 22 Dec 2024 11:10:00 +0300 Subject: [PATCH 7/7] Bugfix iterating if max_results is None --- duckduckgo_search/duckduckgo_search.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/duckduckgo_search/duckduckgo_search.py b/duckduckgo_search/duckduckgo_search.py index 9242fe8..712db94 100644 --- a/duckduckgo_search/duckduckgo_search.py +++ b/duckduckgo_search/duckduckgo_search.py @@ -299,7 +299,7 @@ def _text_api( return results else: next_page_url = row.get("n") - if not next_page_url: + if not next_page_url or not max_results: return results payload["s"] = next_page_url.split("s=")[1].split("&")[0] return results @@ -365,7 +365,7 @@ def _text_html( return results npx = tree.xpath('.//div[@class="nav-link"]') - if not npx: + if not npx or not max_results: return results next_page = npx[-1] if isinstance(npx, list) else None if isinstance(next_page, _Element): @@ -447,7 +447,7 @@ def _text_lite( return results next_page_s = tree.xpath("//form[./input[contains(@value, 'ext')]]/input[@name='s']/@value") - if not next_page_s: + if not next_page_s or not max_results: return results elif isinstance(next_page_s, list): payload["s"] = str(next_page_s[0]) @@ -539,7 +539,7 @@ def images( if max_results and len(results) >= max_results: return results next = resp_json.get("next") - if next is None: + if next is None or not max_results: return results payload["s"] = next.split("s=")[-1].split("&")[0] @@ -609,7 +609,7 @@ def videos( if max_results and len(results) >= max_results: return results next = resp_json.get("next") - if next is None: + if next is None or not max_results: return results payload["s"] = next.split("s=")[-1].split("&")[0] @@ -681,7 +681,7 @@ def news( return results next = resp_json.get("next") - if next is None: + if next is None or not max_results: return results payload["s"] = next.split("s=")[-1].split("&")[0]