V3.9.10 (#146)
1) text(backend="api"): extract the results JSON from the HTML response,
2) bugfix for text(backend="html"): update the Referer header,
3) removed tests for text(backend="lite"), which currently works only with proxies.
deedy5 authored Dec 7, 2023
1 parent 2deb5bb commit f55675a
Showing 7 changed files with 36 additions and 26 deletions.
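
A note on change (1): with "o": "json" commented out of the request payload, the d.js endpoint returns an HTML/JS document rather than bare JSON, so the old resp.json() path breaks; the results array now arrives embedded between the byte markers DDG.pageLayout.load('d', and );DDG.duckbar.load(. The new _text_extract_json helper in utils.py (diffed below) slices it back out. A self-contained sketch of that idea against a synthetic payload; the function name and sample body here are illustrative, only the two byte markers come from the actual helper:

    import json
    from typing import Optional


    def extract_embedded_json(html_bytes: bytes) -> Optional[list]:
        """Illustrative twin of _text_extract_json: slice the results
        JSON out of the d.js HTML/JS payload."""
        try:
            # 24 == len(b"DDG.pageLayout.load('d',"), i.e. skip past the marker
            start = html_bytes.index(b"DDG.pageLayout.load('d',") + 24
            # stop just before the next DDG.* call that follows the JSON array
            end = html_bytes.index(b");DDG.duckbar.load(", start)
            return json.loads(html_bytes[start:end])
        except ValueError:
            # marker missing (bytes.index) or JSON malformed -> None
            return None


    # Hypothetical miniature of a d.js response body:
    sample = b'DDG.pageLayout.load(\'d\',[{"u": "https://example.com", "t": "Example"}]);DDG.duckbar.load(\'images\');'
    print(extract_embedded_json(sample))  # [{'u': 'https://example.com', 't': 'Example'}]

Catching ValueError covers both failure modes at once, since json.JSONDecodeError subclasses ValueError.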
12 changes: 6 additions & 6 deletions duckduckgo_search/duckduckgo_search.py
@@ -1,3 +1,4 @@
+import json
 import logging
 from collections import deque
 from datetime import datetime, timezone
@@ -12,7 +13,7 @@
 
 from .exceptions import APIException, DuckDuckGoSearchException, HTTPException, RateLimitException, TimeoutException
 from .models import MapsResult
-from .utils import HEADERS, USERAGENTS, _extract_vqd, _is_500_in_url, _normalize, _normalize_url
+from .utils import HEADERS, USERAGENTS, _extract_vqd, _is_500_in_url, _normalize, _normalize_url, _text_extract_json
 
 logger = logging.getLogger(__name__)
 
@@ -140,7 +141,7 @@ def _text_api(
             "s": "0",
             "df": timelimit,
             "vqd": vqd,
-            "o": "json",
+            # "o": "json",
             "sp": "0",
         }
         safesearch = safesearch.lower()
@@ -157,10 +158,7 @@
         if resp is None:
             return
 
-        try:
-            page_data = resp.json().get("results", None)
-        except Exception:
-            return
+        page_data = _text_extract_json(resp.content)
         if page_data is None:
             return
 
@@ -207,13 +205,15 @@ def _text_html(
         """
         assert keywords, "keywords is mandatory"
 
+        self._client.headers["Referer"] = "https://html.duckduckgo.com/"
         safesearch_base = {"on": 1, "moderate": -1, "off": -2}
         payload = {
             "q": keywords,
             "s": "0",
             "kl": region,
             "p": safesearch_base[safesearch.lower()],
             "df": timelimit,
+            "b": "",
         }
         cache: Set[str] = set()
         for _ in range(11):
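
Change (2) is the added Referer line above; the async client gets the identical line in its diff below, and the new "b": "" payload field lands in the same hunk. A minimal standalone sketch of the resulting request shape, assuming an httpx-style client; the endpoint URL and header values here are assumptions drawn from that context, while the real code mutates self._client.headers inside _text_html and sends the request through its own retry wrapper:

    import httpx

    client = httpx.Client(headers={"User-Agent": "Mozilla/5.0"})
    # the fix: make the Referer match the form's own origin
    client.headers["Referer"] = "https://html.duckduckgo.com/"
    resp = client.post(
        "https://html.duckduckgo.com/html",
        data={"q": "python", "s": "0", "kl": "wt-wt", "p": -1, "df": "", "b": ""},
    )
    print(resp.status_code)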
12 changes: 5 additions & 7 deletions duckduckgo_search/duckduckgo_search_async.py
@@ -12,7 +12,7 @@
 
 from .exceptions import APIException, DuckDuckGoSearchException, HTTPException, RateLimitException, TimeoutException
 from .models import MapsResult
-from .utils import HEADERS, USERAGENTS, _extract_vqd, _is_500_in_url, _normalize, _normalize_url
+from .utils import HEADERS, USERAGENTS, _extract_vqd, _is_500_in_url, _normalize, _normalize_url, _text_extract_json
 
 logger = logging.getLogger(__name__)
 
@@ -141,7 +141,7 @@ async def _text_api(
             "s": "0",
             "df": timelimit,
             "vqd": vqd,
-            "o": "json",
+            # "o": "json",
             "sp": "0",
         }
         safesearch = safesearch.lower()
@@ -158,14 +158,11 @@
         if resp is None:
             return
 
-        try:
-            page_data = resp.json().get("results", None)
-        except Exception:
-            return
+        page_data = _text_extract_json(resp.content)
         if page_data is None:
             return
 
-        result_exists = False
+        result_exists, next_page_url = False, None
         for row in page_data:
             href = row.get("u", None)
             if href and href not in cache and href != f"http://www.google.com/search?q={keywords}":
@@ -208,6 +205,7 @@ async def _text_html(
         """
         assert keywords, "keywords is mandatory"
 
+        self._client.headers["Referer"] = "https://html.duckduckgo.com/"
         safesearch_base = {"on": 1, "moderate": -1, "off": -2}
         payload = {
             "q": keywords,
12 changes: 12 additions & 0 deletions duckduckgo_search/utils.py
@@ -1,3 +1,4 @@
+import json
 import re
 from html import unescape
 from typing import Optional
@@ -38,6 +39,17 @@ def _extract_vqd(html_bytes: bytes, keywords: str) -> Optional[str]:
     raise VQDExtractionException(f"Could not extract vqd. {keywords=}")
 
 
+def _text_extract_json(html_bytes: bytes) -> Optional[str]:
+    """text(backend="api") -> extract json from html"""
+    try:
+        start = html_bytes.index(b"DDG.pageLayout.load('d',") + 24
+        end = html_bytes.index(b");DDG.duckbar.load(", start)
+        data = html_bytes[start:end]
+        return json.loads(data)
+    except ValueError:
+        pass
+
+
 def _is_500_in_url(url: str) -> bool:
     """something like '506-00.js' inside the url"""
     return bool(REGEX_500_IN_URL.search(url))
2 changes: 1 addition & 1 deletion duckduckgo_search/version.py
@@ -1 +1 @@
-__version__ = "3.9.9"
+__version__ = "3.9.10"
2 changes: 1 addition & 1 deletion tests/test_cli.py
@@ -14,7 +14,7 @@
 @pytest.fixture(autouse=True)
 def slow_down_tests():
     yield
-    sleep(1)
+    sleep(2)
 
 
 def test_version_command():
10 changes: 5 additions & 5 deletions tests/test_duckduckgo_search.py
@@ -6,7 +6,7 @@
 @pytest.fixture(autouse=True)
 def slow_down_tests():
     yield
-    sleep(1)
+    sleep(2)
 
 
 def test_text():
@@ -27,10 +27,10 @@ def test_text_html():
     assert len(results) == 30
 
 
-def test_text_lite():
-    with DDGS() as ddgs:
-        results = [x for x in ddgs.text("dog", backend="lite", max_results=30)]
-        assert len(results) == 30
+# def test_text_lite():
+#     with DDGS() as ddgs:
+#         results = [x for x in ddgs.text("dog", backend="lite", max_results=30)]
+#         assert len(results) == 30
 
 
 def test_images():
12 changes: 6 additions & 6 deletions tests/test_duckduckgo_search_async.py
@@ -7,7 +7,7 @@
 @pytest.fixture(autouse=True)
 def slow_down_tests():
     yield
-    sleep(1)
+    sleep(2)
 
 
 @pytest.mark.asyncio
@@ -31,11 +31,11 @@ async def test_text_html():
     assert len(results) == 30
 
 
-@pytest.mark.asyncio
-async def test_text_lite():
-    async with AsyncDDGS() as ddgs:
-        results = [x async for x in ddgs.text("dog", backend="lite", max_results=30)]
-        assert len(results) == 30
+# @pytest.mark.asyncio
+# async def test_text_lite():
+#     async with AsyncDDGS() as ddgs:
+#         results = [x async for x in ddgs.text("dog", backend="lite", max_results=30)]
+#         assert len(results) == 30
 
 
 @pytest.mark.asyncio
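
As for change (3), the lite tests above are commented out rather than deleted: per the commit message, that backend currently answers only when routed through a proxy. A hedged sketch of how it might still be exercised, assuming the DDGS constructor accepts a proxies argument in this release; the proxy URL is a placeholder:

    from duckduckgo_search import DDGS

    # "socks5://localhost:9150" is a placeholder; any working HTTP/SOCKS proxy URL applies.
    with DDGS(proxies="socks5://localhost:9150", timeout=20) as ddgs:
        for r in ddgs.text("dog", backend="lite", max_results=10):
            print(r["href"], "-", r["title"])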