From d0532ea8d31198479fd0344086ed4137d1492875 Mon Sep 17 00:00:00 2001
From: deedy5 <65482418+deedy5@users.noreply.github.com>
Date: Sat, 21 Dec 2024 23:41:07 +0300
Subject: [PATCH 1/7] DDGS.images: remove multithreading

---
 duckduckgo_search/duckduckgo_search.py | 27 ++++++++++----------------
 1 file changed, 10 insertions(+), 17 deletions(-)

diff --git a/duckduckgo_search/duckduckgo_search.py b/duckduckgo_search/duckduckgo_search.py
index bb38bfd..d6f8a5e 100644
--- a/duckduckgo_search/duckduckgo_search.py
+++ b/duckduckgo_search/duckduckgo_search.py
@@ -516,13 +516,11 @@ def images(
         cache = set()
         results: list[dict[str, str]] = []
 
-        def _images_page(s: int) -> list[dict[str, str]]:
-            payload["s"] = f"{s}"
+        for _ in range(5):
             resp_content = self._get_url("GET", "https://duckduckgo.com/i.js", params=payload)
             resp_json = json_loads(resp_content)
-
             page_data = resp_json.get("results", [])
-            page_results = []
+
             for row in page_data:
                 image_url = row.get("image")
                 if image_url and image_url not in cache:
@@ -536,20 +534,15 @@ def _images_page(s: int) -> list[dict[str, str]]:
                         "width": row["width"],
                         "source": row["source"],
                     }
-                    page_results.append(result)
-            return page_results
-
-        slist = [0]
-        if max_results:
-            max_results = min(max_results, 500)
-            slist.extend(range(100, max_results, 100))
-        try:
-            for r in self._executor.map(_images_page, slist):
-                results.extend(r)
-        except Exception as e:
-            raise e
+                    results.append(result)
+                    if max_results and len(results) >= max_results:
+                        return results
+            next = resp_json.get("next")
+            if next is None:
+                return results
+            payload["s"] = next.split("s=")[-1].split("&")[0]
 
-        return list(islice(results, max_results))
+        return results
 
     def videos(
         self,

From c157aa313680cd5c3c52ae949d92f827a520902a Mon Sep 17 00:00:00 2001
From: deedy5 <65482418+deedy5@users.noreply.github.com>
Date: Sat, 21 Dec 2024 23:48:25 +0300
Subject: [PATCH 2/7] DDGS.videos: remove multithreading

---
 duckduckgo_search/duckduckgo_search.py | 27 ++++++++++----------------
 1 file changed, 10 insertions(+), 17 deletions(-)

diff --git a/duckduckgo_search/duckduckgo_search.py b/duckduckgo_search/duckduckgo_search.py
index d6f8a5e..ed7da73 100644
--- a/duckduckgo_search/duckduckgo_search.py
+++ b/duckduckgo_search/duckduckgo_search.py
@@ -596,30 +596,23 @@ def videos(
         cache = set()
         results: list[dict[str, str]] = []
 
-        def _videos_page(s: int) -> list[dict[str, str]]:
-            payload["s"] = f"{s}"
+        for _ in range(8):
             resp_content = self._get_url("GET", "https://duckduckgo.com/v.js", params=payload)
             resp_json = json_loads(resp_content)
-
             page_data = resp_json.get("results", [])
-            page_results = []
+
             for row in page_data:
                 if row["content"] not in cache:
                     cache.add(row["content"])
-                    page_results.append(row)
-            return page_results
-
-        slist = [0]
-        if max_results:
-            max_results = min(max_results, 200)
-            slist.extend(range(60, max_results, 60))
-        try:
-            for r in self._executor.map(_videos_page, slist):
-                results.extend(r)
-        except Exception as e:
-            raise e
+                    results.append(row)
+                    if max_results and len(results) >= max_results:
+                        return results
+            next = resp_json.get("next")
+            if next is None:
+                return results
+            payload["s"] = next.split("s=")[-1].split("&")[0]
 
-        return list(islice(results, max_results))
+        return results
 
     def news(
         self,

From f695a21406c7d7fc507eb329abfa35b572c6a06c Mon Sep 17 00:00:00 2001
From: deedy5 <65482418+deedy5@users.noreply.github.com>
Date: Sat, 21 Dec 2024 23:52:29 +0300
Subject: [PATCH 3/7] DDGS.news: remove multithreading

---
 duckduckgo_search/duckduckgo_search.py | 25 ++++++++++---------------
 1 file changed, 10 insertions(+), 15 deletions(-)

diff --git a/duckduckgo_search/duckduckgo_search.py b/duckduckgo_search/duckduckgo_search.py
index ed7da73..a70b259 100644
--- a/duckduckgo_search/duckduckgo_search.py
+++ b/duckduckgo_search/duckduckgo_search.py
@@ -658,12 +658,11 @@ def news(
         cache = set()
         results: list[dict[str, str]] = []
 
-        def _news_page(s: int) -> list[dict[str, str]]:
-            payload["s"] = f"{s}"
+        for _ in range(5):
             resp_content = self._get_url("GET", "https://duckduckgo.com/news.js", params=payload)
             resp_json = json_loads(resp_content)
             page_data = resp_json.get("results", [])
-            page_results = []
+
             for row in page_data:
                 if row["url"] not in cache:
                     cache.add(row["url"])
@@ -676,17 +675,13 @@ def _news_page(s: int) -> list[dict[str, str]]:
                         "image": _normalize_url(image_url),
                         "source": row["source"],
                     }
-                    page_results.append(result)
-            return page_results
+                    results.append(result)
+                    if max_results and len(results) >= max_results:
+                        return results
 
-        slist = [0]
-        if max_results:
-            max_results = min(max_results, 120)
-            slist.extend(range(30, max_results, 30))
-        try:
-            for r in self._executor.map(_news_page, slist):
-                results.extend(r)
-        except Exception as e:
-            raise e
+            next = resp_json.get("next")
+            if next is None:
+                return results
+            payload["s"] = next.split("s=")[-1].split("&")[0]
 
-        return list(islice(results, max_results))
+        return results

From ce47395ca50623c0104ebff72945560cf5e03e97 Mon Sep 17 00:00:00 2001
From: deedy5 <65482418+deedy5@users.noreply.github.com>
Date: Sun, 22 Dec 2024 00:39:37 +0300
Subject: [PATCH 4/7] Remove dead code

---
 duckduckgo_search/duckduckgo_search.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/duckduckgo_search/duckduckgo_search.py b/duckduckgo_search/duckduckgo_search.py
index a70b259..b6d09f0 100644
--- a/duckduckgo_search/duckduckgo_search.py
+++ b/duckduckgo_search/duckduckgo_search.py
@@ -3,10 +3,9 @@
 import logging
 import os
 import warnings
-from concurrent.futures import ThreadPoolExecutor
 from datetime import datetime, timezone
 from functools import cached_property
-from itertools import cycle, islice
+from itertools import cycle
 from random import choice
 from time import sleep, time
 from types import TracebackType
@@ -39,7 +38,6 @@
 class DDGS:
     """DuckDuckgo_search class to get search results from duckduckgo.com."""
 
-    _executor: ThreadPoolExecutor = ThreadPoolExecutor()
     _impersonates = (
         "chrome_100", "chrome_101", "chrome_104", "chrome_105", "chrome_106", "chrome_107", "chrome_108",
         "chrome_109", "chrome_114", "chrome_116", "chrome_117", "chrome_118", "chrome_119", "chrome_120",

From 829d2129152566e130a7f9a2736a9f9d605f810a Mon Sep 17 00:00:00 2001
From: deedy5 <65482418+deedy5@users.noreply.github.com>
Date: Sun, 22 Dec 2024 11:05:44 +0300
Subject: [PATCH 5/7] DDGS.text: add `auto` backend; add lxml to dependencies

---
 README.md                              |  8 ++---
 duckduckgo_search/cli.py               |  2 +-
 duckduckgo_search/duckduckgo_search.py | 49 ++++++++++++++------------
 pyproject.toml                         |  4 +--
 4 files changed, 31 insertions(+), 32 deletions(-)

diff --git a/README.md b/README.md
index 2b88e44..6a241dc 100755
--- a/README.md
+++ b/README.md
@@ -22,9 +22,6 @@ AI chat and search for text, news, images and videos using the DuckDuckGo.com se
 ```python
 pip install -U duckduckgo_search
 ```
-> [!NOTE]
-> you can install lxml to use the `text` function with `backend='html'` or `backend='lite'` (size ≈ 12Mb)</br>
-> `pip install -U duckduckgo_search[lxml]`
 
 ## CLI version
 
@@ -235,7 +232,7 @@ def text(
     region: str = "wt-wt",
     safesearch: str = "moderate",
     timelimit: str | None = None,
-    backend: str = "api",
+    backend: str = "auto",
     max_results: int | None = None,
 ) -> list[dict[str, str]]:
     """DuckDuckGo text search generator. Query params: https://duckduckgo.com/params.
@@ -245,7 +242,8 @@ def text(
         region: wt-wt, us-en, uk-en, ru-ru, etc. Defaults to "wt-wt".
         safesearch: on, moderate, off. Defaults to "moderate".
         timelimit: d, w, m, y. Defaults to None.
-        backend: api, html, lite. Defaults to api.
+        backend: auto, api, html, lite. Defaults to auto.
+            auto - try all backends in random order,
             api - collect data from https://duckduckgo.com,
             html - collect data from https://html.duckduckgo.com,
             lite - collect data from https://lite.duckduckgo.com.
diff --git a/duckduckgo_search/cli.py b/duckduckgo_search/cli.py
index f6b1326..73fc3ed 100644
--- a/duckduckgo_search/cli.py
+++ b/duckduckgo_search/cli.py
@@ -197,7 +197,7 @@ def chat(load, proxy, multiline, timeout, verify, model):
 @click.option("-o", "--output", help="csv, json or filename.csv|json (save the results to a csv or json file)")
 @click.option("-d", "--download", is_flag=True, default=False, help="download results. -dd to set custom directory")
 @click.option("-dd", "--download-directory", help="Specify custom download directory")
-@click.option("-b", "--backend", default="api", type=click.Choice(["api", "html", "lite"]), help="which backend to use")
+@click.option("-b", "--backend", default="auto", type=click.Choice(["auto", "api", "html", "lite"]))
 @click.option("-th", "--threads", default=10, help="download threads, default=10")
 @click.option("-p", "--proxy", help="the proxy to send requests, example: socks5://127.0.0.1:9150")
 @click.option("-v", "--verify", default=True, help="verify SSL when making the request")
diff --git a/duckduckgo_search/duckduckgo_search.py b/duckduckgo_search/duckduckgo_search.py
index b6d09f0..9242fe8 100644
--- a/duckduckgo_search/duckduckgo_search.py
+++ b/duckduckgo_search/duckduckgo_search.py
@@ -6,21 +6,15 @@
 from datetime import datetime, timezone
 from functools import cached_property
 from itertools import cycle
-from random import choice
+from random import choice, shuffle
 from time import sleep, time
 from types import TracebackType
 from typing import cast
 
 import primp  # type: ignore
-
-try:
-    from lxml.etree import _Element
-    from lxml.html import HTMLParser as LHTMLParser
-    from lxml.html import document_fromstring
-
-    LXML_AVAILABLE = True
-except ImportError:
-    LXML_AVAILABLE = False
+from lxml.etree import _Element
+from lxml.html import HTMLParser as LHTMLParser
+from lxml.html import document_fromstring
 
 from .exceptions import ConversationLimitException, DuckDuckGoSearchException, RatelimitException, TimeoutException
 from .utils import (
@@ -213,7 +207,7 @@ def text(
         region: str = "wt-wt",
         safesearch: str = "moderate",
         timelimit: str | None = None,
-        backend: str = "api",
+        backend: str = "auto",
         max_results: int | None = None,
     ) -> list[dict[str, str]]:
         """DuckDuckGo text search. Query params: https://duckduckgo.com/params.
@@ -223,7 +217,8 @@ def text(
             region: wt-wt, us-en, uk-en, ru-ru, etc. Defaults to "wt-wt".
             safesearch: on, moderate, off. Defaults to "moderate".
             timelimit: d, w, m, y. Defaults to None.
-            backend: api, html, lite. Defaults to api.
+            backend: auto, api, html, lite. Defaults to auto.
+                auto - try all backends in random order,
                 api - collect data from https://duckduckgo.com,
                 html - collect data from https://html.duckduckgo.com,
                 lite - collect data from https://lite.duckduckgo.com.
@@ -237,17 +232,25 @@ def text(
             RatelimitException: Inherits from DuckDuckGoSearchException, raised for exceeding API request rate limits.
             TimeoutException: Inherits from DuckDuckGoSearchException, raised for API request timeouts.
         """
-        if LXML_AVAILABLE is False and backend != "api":
-            backend = "api"
-            warnings.warn("lxml is not installed. Using backend='api'.", stacklevel=2)
-
-        if backend == "api":
-            results = self._text_api(keywords, region, safesearch, timelimit, max_results)
-        elif backend == "html":
-            results = self._text_html(keywords, region, timelimit, max_results)
-        elif backend == "lite":
-            results = self._text_lite(keywords, region, timelimit, max_results)
-        return results
+
+        backends = ["api", "html", "lite"] if backend == "auto" else [backend]
+        shuffle(backends)
+
+        results, err = [], None
+        for b in backends:
+            try:
+                if b == "api":
+                    results = self._text_api(keywords, region, safesearch, timelimit, max_results)
+                elif b == "html":
+                    results = self._text_html(keywords, region, timelimit, max_results)
+                elif b == "lite":
+                    results = self._text_lite(keywords, region, timelimit, max_results)
+                return results
+            except Exception as ex:
+                logger.info(f"Error to search using {b} backend: {ex}")
+                err = ex
+
+        raise DuckDuckGoSearchException(err)
 
     def _text_api(
         self,
diff --git a/pyproject.toml b/pyproject.toml
index 262f180..b6ab9b7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -31,6 +31,7 @@ classifiers = [
 dependencies = [
     "click>=8.1.7",
     "primp>=0.9.1",
+    "lxml>=5.3.0",
 ]
 dynamic = ["version"]
 
@@ -44,9 +45,6 @@ ddgs = "duckduckgo_search.cli:safe_entry_point"
 version = {attr = "duckduckgo_search.version.__version__"}
 
 [project.optional-dependencies]
-lxml = [
-    "lxml>=5.3.0",
-]
 dev = [
     "mypy>=1.13.0",
     "pytest>=8.3.4",

From ee1dce3cb4d1101b4109cdae28bc0842dda66cbb Mon Sep 17 00:00:00 2001
From: deedy5 <65482418+deedy5@users.noreply.github.com>
Date: Sun, 22 Dec 2024 11:07:15 +0300
Subject: [PATCH 6/7] tests: update

---
 pyproject.toml    |  1 +
 tests/test_cli.py | 66 ++++++++++++++++++++++++++---------------------
 2 files changed, 38 insertions(+), 29 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index b6ab9b7..6d86859 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -48,6 +48,7 @@ version = {attr = "duckduckgo_search.version.__version__"}
 dev = [
     "mypy>=1.13.0",
     "pytest>=8.3.4",
+    "pytest-dependency>=0.6.0",
     "ruff>=0.8.3",
 ]
 
diff --git a/tests/test_cli.py b/tests/test_cli.py
index b072562..2b88e29 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -1,4 +1,5 @@
 import os
+import pathlib
 import shutil
 import time
 
@@ -9,7 +10,8 @@
 from duckduckgo_search.cli import _download_results, _save_csv, _save_json, cli
 
 runner = CliRunner()
-
+TEXT_RESULTS = None
+IMAGES_RESULTS = None
 
 @pytest.fixture(autouse=True)
 def pause_between_tests():
@@ -46,43 +48,49 @@ def test_videos_command():
     assert "title" in result.output
 
 
-def test_save_csv(tmp_path):
-    keywords = "cat"
-    with DDGS() as ddgs:
-        results = ddgs.text(keywords, max_results=10)
-        assert 5 <= len(results) <= 10
+@pytest.mark.dependency()
+def test_get_text():
+    global TEXT_RESULTS  # module-level cache, reused by dependent tests such as test_text_download
+    TEXT_RESULTS = DDGS().text("test")
+    assert TEXT_RESULTS
+
+
+@pytest.mark.dependency()
+def test_get_images():
+    global IMAGES_RESULTS  # module-level cache, reused by test_images_download
+    IMAGES_RESULTS = DDGS().images("test")
+    assert IMAGES_RESULTS
+
 
-    temp_file = tmp_path / f"{keywords}.csv"
-    _save_csv(temp_file, results)
+@pytest.mark.dependency(depends=["test_get_text"])
+def test_save_csv(tmp_path):
+    temp_file = tmp_path / "test_csv.csv"
+    _save_csv(temp_file, TEXT_RESULTS)  # TEXT_RESULTS is filled in by test_get_text
     assert temp_file.exists()
 
 
+@pytest.mark.dependency(depends=["test_get_text"])
 def test_save_json(tmp_path):
-    keywords = "dog"
-    with DDGS() as ddgs:
-        results = ddgs.text(keywords, max_results=10)
-        assert 5 <= len(results) <= 10
-
-    temp_file = tmp_path / f"{keywords}.json"
-    _save_json(temp_file, results)
+    temp_file = tmp_path / "test_json.json"
+    _save_json(temp_file, TEXT_RESULTS)  # TEXT_RESULTS is filled in by test_get_text
     assert temp_file.exists()
 
 
+@pytest.mark.dependency(depends=["test_get_text"])
 def test_text_download():
-    keywords = "sea"
-    with DDGS() as ddgs:
-        results = ddgs.text(keywords, max_results=8)
-    assert 5 <= len(results) <= 8
-
-    _download_results(keywords, results, function_name="text", pathname="text_downloads")
-    shutil.rmtree("text_downloads")
+    pathname = pathlib.Path("text_downloads")
+    _download_results("test", TEXT_RESULTS, function_name="text", pathname=str(pathname))
+    assert pathname.is_dir() and any(pathname.iterdir())  # a bare iterdir() generator is always truthy
+    for file in pathname.iterdir():
+        assert file.is_file()
+    shutil.rmtree(str(pathname))
 
 
+@pytest.mark.dependency(depends=["test_get_images"])
 def test_images_download():
-    keywords = "sky"
-    with DDGS() as ddgs:
-        results = ddgs.images(keywords, max_results=8)
-    assert len(results) >= 8
-
-    _download_results(keywords, results, function_name="images", pathname="images_downloads")
-    shutil.rmtree("images_downloads")
+    pathname = pathlib.Path("images_downloads")
+    _download_results("test", IMAGES_RESULTS, function_name="images", pathname=str(pathname))
+    assert pathname.is_dir() and any(pathname.iterdir())  # a bare iterdir() generator is always truthy
+    for file in pathname.iterdir():
+        assert file.is_file()
+    shutil.rmtree(str(pathname))

From da91fff5bb3e81031623109d92f8d608fdc9bb85 Mon Sep 17 00:00:00 2001
From: deedy5 <65482418+deedy5@users.noreply.github.com>
Date: Sun, 22 Dec 2024 11:10:00 +0300
Subject: [PATCH 7/7] Fix pagination when max_results is None

---
 duckduckgo_search/duckduckgo_search.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/duckduckgo_search/duckduckgo_search.py b/duckduckgo_search/duckduckgo_search.py
index 9242fe8..712db94 100644
--- a/duckduckgo_search/duckduckgo_search.py
+++ b/duckduckgo_search/duckduckgo_search.py
@@ -299,7 +299,7 @@ def _text_api(
                             return results
                 else:
                     next_page_url = row.get("n")
-                    if not next_page_url:
+                    if not next_page_url or not max_results:
                         return results
                     payload["s"] = next_page_url.split("s=")[1].split("&")[0]
         return results
@@ -365,7 +365,7 @@ def _text_html(
                             return results
 
             npx = tree.xpath('.//div[@class="nav-link"]')
-            if not npx:
+            if not npx or not max_results:
                 return results
             next_page = npx[-1] if isinstance(npx, list) else None
             if isinstance(next_page, _Element):
@@ -447,7 +447,7 @@ def _text_lite(
                                 return results
 
             next_page_s = tree.xpath("//form[./input[contains(@value, 'ext')]]/input[@name='s']/@value")
-            if not next_page_s:
+            if not next_page_s or not max_results:
                 return results
             elif isinstance(next_page_s, list):
                 payload["s"] = str(next_page_s[0])
@@ -539,7 +539,7 @@ def images(
                     if max_results and len(results) >= max_results:
                         return results
             next = resp_json.get("next")
-            if next is None:
+            if next is None or not max_results:
                 return results
             payload["s"] = next.split("s=")[-1].split("&")[0]
 
@@ -609,7 +609,7 @@ def videos(
                     if max_results and len(results) >= max_results:
                         return results
             next = resp_json.get("next")
-            if next is None:
+            if next is None or not max_results:
                 return results
             payload["s"] = next.split("s=")[-1].split("&")[0]
 
@@ -681,7 +681,7 @@ def news(
                         return results
 
             next = resp_json.get("next")
-            if next is None:
+            if next is None or not max_results:
                 return results
             payload["s"] = next.split("s=")[-1].split("&")[0]