Merge pull request #81 from deedy5/v3.5.0

v3.5.0 1. DDGS().text() - add new parameter backend: api - collect data from duckduckgo.com, html - collect data from html.duckduckgo.com, lite - collect data from lite.duckduckgo.com. 2. DDGS().text(backend='api') and DDGS().images() - add sleep(0.75) after receiving the vqd. 3. CLI - ddgs text - add -b (--backend) parameter. 4. add tests for DDGS().text(backend='html') and DDGS().text(backend='lite').
deedy5 · May 26, 2023 · 3ca7058 · 3ca7058
2 parents e7071db + bec968e
commit 3ca7058
Show file tree

Hide file tree

Showing 5 changed files with 151 additions and 31 deletions.
diff --git a/README.md b/README.md
@@ -173,7 +173,6 @@ for r in ddgs_text_gen:
 [Go To TOP](#TOP)
 
 ## 1. text() - text search by duckduckgo.com
-*WARNING!: Since version v3.4.0, api requests have been replaced by html parsing. Set a delay of at least 1 second between function calls.*
 ```python
 def text(
     keywords: str,
@@ -188,7 +187,10 @@ def text(
         region: wt-wt, us-en, uk-en, ru-ru, etc. Defaults to "wt-wt".
         safesearch: on, moderate, off. Defaults to "moderate".
         timelimit: d, w, m, y. Defaults to None.
-
+        backend: api, html, lite. Defaults to api.
+            api - collect data from https://duckduckgo.com,
+            html - collect data from https://html.duckduckgo.com,
+            lite - collect data from https://lite.duckduckgo.com.
     Yields:
         dict with search results.
 
@@ -200,7 +202,7 @@ from duckduckgo_search import DDGS
 
 ddgs = DDGS()
 
-keywords = 'Bella Ciao'
+keywords = 'live free or die'
 ddgs_text_gen = ddgs.text(keywords, region='wt-wt', safesearch='Off', timelimit='y')
 for r in ddgs_text_gen:
 	print(r)
@@ -210,6 +212,13 @@ keywords = 'russia filetype:pdf'
 ddgs_text_gen = ddgs.text(keywords, region='wt-wt', safesearch='Off', timelimit='y')
 for r in ddgs_text_gen:
 	print(r)
+
+# Use the lite backend and limit the number of results to 10
+from itertools import islice
+
+ddgs_text_gen = DDGS().text("notes from a dead house", backend="lite")
+for r in islice(ddgs_text_gen, 10):
+	print(r)
 ```
 
 

diff --git a/duckduckgo_search/cli.py b/duckduckgo_search/cli.py
@@ -146,7 +146,7 @@ def version():
 
 
 @cli.command()
-@click.option("-k", "--keywords", help="text search, keywords for query")
+@click.option("-k", "--keywords", required=True, help="text search, keywords for query")
 @click.option(
     "-r",
     "--region",
@@ -186,6 +186,13 @@ def version():
     default=False,
     help="download results to 'keywords' folder",
 )
+@click.option(
+    "-b",
+    "--backend",
+    default="api",
+    type=click.Choice(["api", "html", "lite"]),
+    help="which backend to use, default=api",
+)
 def text(keywords, output, download, max_results, *args, **kwargs):
     data = []
     for r in DDGS().text(keywords=keywords, *args, **kwargs):
@@ -205,7 +212,9 @@ def text(keywords, output, download, max_results, *args, **kwargs):
 
 
 @cli.command()
-@click.option("-k", "--keywords", help="answers search, keywords for query")
+@click.option(
+    "-k", "--keywords", required=True, help="answers search, keywords for query"
+)
 @click.option(
     "-o",
     "--output",
@@ -226,7 +235,7 @@ def answers(keywords, output, *args, **kwargs):
 
 
 @cli.command()
-@click.option("-k", "--keywords", help="keywords for query")
+@click.option("-k", "--keywords", required=True, help="keywords for query")
 @click.option(
     "-r",
     "--region",
@@ -333,7 +342,7 @@ def images(keywords, output, download, max_results, *args, **kwargs):
 
 
 @cli.command()
-@click.option("-k", "--keywords", help="keywords for query")
+@click.option("-k", "--keywords", required=True, help="keywords for query")
 @click.option(
     "-r",
     "--region",
@@ -397,7 +406,7 @@ def videos(keywords, output, max_results, *args, **kwargs):
 
 
 @cli.command()
-@click.option("-k", "--keywords", help="keywords for query")
+@click.option("-k", "--keywords", required=True, help="keywords for query")
 @click.option(
     "-r",
     "--region",
@@ -446,7 +455,7 @@ def news(keywords, output, max_results, *args, **kwargs):
 
 
 @cli.command()
-@click.option("-k", "--keywords", help="keywords for query")
+@click.option("-k", "--keywords", required=True, help="keywords for query")
 @click.option(
     "-p",
     "--place",
@@ -504,7 +513,7 @@ def maps(keywords, output, max_results, *args, **kwargs):
 
 
 @cli.command()
-@click.option("-k", "--keywords", help="text for translation")
+@click.option("-k", "--keywords", required=True, help="text for translation")
 @click.option(
     "-f",
     "--from_",
@@ -535,7 +544,7 @@ def translate(keywords, output, *args, **kwargs):
 
 
 @cli.command()
-@click.option("-k", "--keywords", help="keywords for query")
+@click.option("-k", "--keywords", required=True, help="keywords for query")
 @click.option(
     "-r",
     "--region",

diff --git a/duckduckgo_search/duckduckgo_search.py b/duckduckgo_search/duckduckgo_search.py
@@ -5,6 +5,7 @@
 from datetime import datetime
 from decimal import Decimal
 from html import unescape
+from itertools import cycle
 from time import sleep
 from typing import Deque, Dict, Iterator, Optional, Set
 from urllib.parse import unquote
@@ -98,13 +99,42 @@ def _normalize(self, raw_html: str) -> str:
             return unescape(re.sub(REGEX_STRIP_TAGS, "", raw_html))
         return ""
 
-    '''
     def text(
         self,
         keywords: str,
         region: str = "wt-wt",
         safesearch: str = "moderate",
         timelimit: Optional[str] = None,
+        backend: str = "api",
+    ) -> Iterator[dict]:
+        """DuckDuckGo text search generator. Query params: https://duckduckgo.com/params
+
+        Args:
+            keywords: keywords for query.
+            region: wt-wt, us-en, uk-en, ru-ru, etc. Defaults to "wt-wt".
+            safesearch: on, moderate, off. Defaults to "moderate".
+            timelimit: d, w, m, y. Defaults to None.
+            backend: api, html, lite. Defaults to api.
+                api - collect data from https://duckduckgo.com,
+                html - collect data from https://html.duckduckgo.com,
+                lite - collect data from https://lite.duckduckgo.com.
+        Yields:
+            dict with search results.
+
+        """
+        if backend == "api":
+            yield from self._text_api(keywords, region, safesearch, timelimit)
+        elif backend == "html":
+            yield from self._text_html(keywords, region, safesearch, timelimit)
+        elif backend == "lite":
+            yield from self._text_lite(keywords, region, timelimit)
+
+    def _text_api(
+        self,
+        keywords: str,
+        region: str = "wt-wt",
+        safesearch: str = "moderate",
+        timelimit: Optional[str] = None,
     ) -> Iterator[dict]:
         """DuckDuckGo text search generator. Query params: https://duckduckgo.com/params
 
@@ -122,6 +152,7 @@ def text(
 
         vqd = self._get_vqd(keywords)
         assert vqd, "error in getting vqd"
+        sleep(0.75)
 
         payload = {
             "q": keywords,  #
@@ -144,7 +175,8 @@ def text(
             payload["p"] = "1"
 
         cache = set()
-        for _ in range(10):
+        for s in ("0", "20", "70", "120"):
+            payload["s"] = s
             resp = self._get_url(
                 "GET", "https://links.duckduckgo.com/d.js", params=payload
             )
@@ -157,10 +189,7 @@ def text(
             if page_data is None:
                 break
 
-            result_exists = False
             for row in page_data:
-                if "n" in row:
-                    payload["s"] = row["n"].split("s=")[-1].split("&")[0]
                 href = row.get("u", None)
                 if (
                     href
@@ -170,19 +199,13 @@ def text(
                     cache.add(href)
                     body = self._normalize(row["a"])
                     if body:
-                        result_exists = True
                         yield {
                             "title": self._normalize(row["t"]),
                             "href": href,
                             "body": body,
                         }
-                elif result_exists is False:
-                    break
-            if result_exists is False:
-                break
-    '''
 
-    def text(
+    def _text_html(
         self,
         keywords: str,
         region: str = "wt-wt",
@@ -206,7 +229,7 @@ def text(
         safesearch_base = {"on": 1, "moderate": -1, "off": -2}
         payload = {
             "q": keywords,
-            "l": region,
+            "kl": region,
             "p": safesearch_base[safesearch.lower()],
             "df": timelimit,
         }
@@ -217,10 +240,11 @@ def text(
             )
             if resp is None:
                 break
+
             tree = html.fromstring(resp.content)
             if tree.xpath('//div[@class="no-results"]/text()'):
                 return
-            result_exists = False
+
             for e in tree.xpath('//div[contains(@class, "results_links")]'):
                 href = e.xpath('.//a[contains(@class, "result__a")]/@href')
                 href = href[0] if href else None
@@ -232,16 +256,12 @@ def text(
                     cache.add(href)
                     title = e.xpath('.//a[contains(@class, "result__a")]/text()')
                     body = e.xpath('.//a[contains(@class, "result__snippet")]//text()')
-                    result_exists = True
                     yield {
                         "title": self._normalize(title[0]) if title else None,
                         "href": href,
                         "body": self._normalize("".join(body)) if body else None,
                     }
 
-            if result_exists is False:
-                break
-
             next_page = tree.xpath('.//div[@class="nav-link"]')
             next_page = next_page[-1] if next_page else None
             if next_page is None:
@@ -250,7 +270,70 @@ def text(
             names = next_page.xpath('.//input[@type="hidden"]/@name')
             values = next_page.xpath('.//input[@type="hidden"]/@value')
             payload = {n: v for n, v in zip(names, values)}
-            sleep(1)
+            sleep(0.75)
+
+    def _text_lite(
+        self,
+        keywords: str,
+        region: str = "wt-wt",
+        timelimit: Optional[str] = None,
+    ) -> Iterator[dict]:
+        """DuckDuckGo text search generator. Query params: https://duckduckgo.com/params
+
+        Args:
+            keywords: keywords for query.
+            region: wt-wt, us-en, uk-en, ru-ru, etc. Defaults to "wt-wt".
+            timelimit: d, w, m, y. Defaults to None.
+
+        Yields:
+            dict with search results.
+
+        """
+        assert keywords, "keywords is mandatory"
+
+        payload = {
+            "q": keywords,
+            "kl": region,
+            "df": timelimit,
+        }
+        cache: Set[str] = set()
+        for s in ("0", "20", "70", "120"):
+            payload["s"] = s
+            resp = self._get_url(
+                "POST", "https://lite.duckduckgo.com/lite/", data=payload
+            )
+            if resp is None:
+                break
+
+            tree = html.fromstring(resp.content)
+            if "No more results." in tree.xpath("//table[1]//text()"):
+                return
+
+            result_exists = False
+            for i, e in zip(cycle(range(1, 5)), tree.xpath("//table[last()]//tr")):
+                if i == 1:
+                    href = e.xpath(".//a//@href")
+                    href = href[0] if href else None
+                    if (
+                        href is None
+                        or href in cache
+                        or href == f"http://www.google.com/search?q={keywords}"
+                    ):
+                        continue
+                    title = e.xpath(".//a//text()")[0]
+                elif i == 2:
+                    body = e.xpath(".//td[@class='result-snippet']//text()")
+                    body = "".join(body).strip()
+                elif i == 3:
+                    result_exists = True
+                    yield {
+                        "href": href,
+                        "title": title,
+                        "body": body,
+                    }
+            if result_exists is False:
+                break
+            sleep(0.75)
 
     def images(
         self,
@@ -290,6 +373,7 @@ def images(
 
         vqd = self._get_vqd(keywords)
         assert vqd, "error in getting vqd"
+        sleep(0.75)
 
         safesearch_base = {"on": 1, "moderate": 1, "off": -1}
         timelimit = f"time:{timelimit}" if timelimit else ""

diff --git a/duckduckgo_search/version.py b/duckduckgo_search/version.py
@@ -1 +1 @@
-__version__ = "3.4.1"
+__version__ = "3.5.0"
diff --git a/tests/test_duckduckgo_search.py b/tests/test_duckduckgo_search.py
@@ -14,6 +14,24 @@ def test_text():
             break
     assert counter >= 25
 
+def test_text_html():
+    results_gen = DDGS().text("cat", backend="html")
+    counter = 0
+    for i, x in enumerate(results_gen):
+        counter += 1
+        if i >= 25:
+            break
+    assert counter >= 25
+
+def test_text_lite():
+    results_gen = DDGS().text("cat", backend="lite")
+    counter = 0
+    for i, x in enumerate(results_gen):
+        counter += 1
+        if i >= 25:
+            break
+    assert counter >= 25
+
 
 def test_images():
     results_gen = DDGS().images("cat")