diff --git a/README.md b/README.md index 2f9ade9..ce82eb2 100755 --- a/README.md +++ b/README.md @@ -173,7 +173,6 @@ for r in ddgs_text_gen: [Go To TOP](#TOP) ## 1. text() - text search by by duckduckgo.com -*WARNING!: Since version v3.4.0, api requests have been replaced by html parsing. Set a delay of at least 1 second between function calls.* ```python def text( keywords: str, @@ -188,7 +187,10 @@ def text( region: wt-wt, us-en, uk-en, ru-ru, etc. Defaults to "wt-wt". safesearch: on, moderate, off. Defaults to "moderate". timelimit: d, w, m, y. Defaults to None. - + backend: api, html, lite. Defaults to api. + api - collect data from https://duckduckgo.com, + html - collect data from https://html.duckduckgo.com, + lite - collect data from https://lite.duckduckgo.com. Yields: dict with search results. @@ -200,7 +202,7 @@ from duckduckgo_search import DDGS ddgs = DDGS() -keywords = 'Bella Ciao' +keywords = 'live free or die' ddgs_text_gen = ddgs.text(keywords, region='wt-wt', safesearch='Off', timelimit='y') for r in ddgs_text_gen: print(r) @@ -210,6 +212,13 @@ keywords = 'russia filetype:pdf' ddgs_text_gen = ddgs.text(keywords, region='wt-wt', safesearch='Off', timelimit='y') for r in ddgs_text_gen: print(r) + +# Using lite backend and limit the number of results to 10 +from itertools import islice + +ddgs_text_gen = DDGS().text("notes from a dead house", backend="lite") +for r in islice(ddgs_text_gen, 10): + print(r) ``` diff --git a/duckduckgo_search/cli.py b/duckduckgo_search/cli.py index eefa741..e5cf015 100644 --- a/duckduckgo_search/cli.py +++ b/duckduckgo_search/cli.py @@ -146,7 +146,7 @@ def version(): @cli.command() -@click.option("-k", "--keywords", help="text search, keywords for query") +@click.option("-k", "--keywords", required=True, help="text search, keywords for query") @click.option( "-r", "--region", @@ -186,6 +186,13 @@ def version(): default=False, help="download results to 'keywords' folder", ) +@click.option( + "-b", + "--backend", + default="api", + type=click.Choice(["api", "html", "lite"]), + help="which backend to use, default=api", +) def text(keywords, output, download, max_results, *args, **kwargs): data = [] for r in DDGS().text(keywords=keywords, *args, **kwargs): @@ -205,7 +212,9 @@ def text(keywords, output, download, max_results, *args, **kwargs): @cli.command() -@click.option("-k", "--keywords", help="answers search, keywords for query") +@click.option( + "-k", "--keywords", required=True, help="answers search, keywords for query" +) @click.option( "-o", "--output", @@ -226,7 +235,7 @@ def answers(keywords, output, *args, **kwargs): @cli.command() -@click.option("-k", "--keywords", help="keywords for query") +@click.option("-k", "--keywords", required=True, help="keywords for query") @click.option( "-r", "--region", @@ -333,7 +342,7 @@ def images(keywords, output, download, max_results, *args, **kwargs): @cli.command() -@click.option("-k", "--keywords", help="keywords for query") +@click.option("-k", "--keywords", required=True, help="keywords for query") @click.option( "-r", "--region", @@ -397,7 +406,7 @@ def videos(keywords, output, max_results, *args, **kwargs): @cli.command() -@click.option("-k", "--keywords", help="keywords for query") +@click.option("-k", "--keywords", required=True, help="keywords for query") @click.option( "-r", "--region", @@ -446,7 +455,7 @@ def news(keywords, output, max_results, *args, **kwargs): @cli.command() -@click.option("-k", "--keywords", help="keywords for query") +@click.option("-k", "--keywords", required=True, help="keywords for query") @click.option( "-p", "--place", @@ -504,7 +513,7 @@ def maps(keywords, output, max_results, *args, **kwargs): @cli.command() -@click.option("-k", "--keywords", help="text for translation") +@click.option("-k", "--keywords", required=True, help="text for translation") @click.option( "-f", "--from_", @@ -535,7 +544,7 @@ def translate(keywords, output, *args, **kwargs): @cli.command() -@click.option("-k", "--keywords", help="keywords for query") +@click.option("-k", "--keywords", required=True, help="keywords for query") @click.option( "-r", "--region", diff --git a/duckduckgo_search/duckduckgo_search.py b/duckduckgo_search/duckduckgo_search.py index 673e38f..dcec516 100644 --- a/duckduckgo_search/duckduckgo_search.py +++ b/duckduckgo_search/duckduckgo_search.py @@ -5,6 +5,7 @@ from datetime import datetime from decimal import Decimal from html import unescape +from itertools import cycle from time import sleep from typing import Deque, Dict, Iterator, Optional, Set from urllib.parse import unquote @@ -98,13 +99,42 @@ def _normalize(self, raw_html: str) -> str: return unescape(re.sub(REGEX_STRIP_TAGS, "", raw_html)) return "" - ''' def text( self, keywords: str, region: str = "wt-wt", safesearch: str = "moderate", timelimit: Optional[str] = None, + backend: str = "api", + ) -> Iterator[dict]: + """DuckDuckGo text search generator. Query params: https://duckduckgo.com/params + + Args: + keywords: keywords for query. + region: wt-wt, us-en, uk-en, ru-ru, etc. Defaults to "wt-wt". + safesearch: on, moderate, off. Defaults to "moderate". + timelimit: d, w, m, y. Defaults to None. + backend: api, html, lite. Defaults to api. + api - collect data from https://duckduckgo.com, + html - collect data from https://html.duckduckgo.com, + lite - collect data from https://lite.duckduckgo.com. + Yields: + dict with search results. + + """ + if backend == "api": + yield from self._text_api(keywords, region, safesearch, timelimit) + elif backend == "html": + yield from self._text_html(keywords, region, safesearch, timelimit) + elif backend == "lite": + yield from self._text_lite(keywords, region, timelimit) + + def _text_api( + self, + keywords: str, + region: str = "wt-wt", + safesearch: str = "moderate", + timelimit: Optional[str] = None, ) -> Iterator[dict]: """DuckDuckGo text search generator. Query params: https://duckduckgo.com/params @@ -122,6 +152,7 @@ def text( vqd = self._get_vqd(keywords) assert vqd, "error in getting vqd" + sleep(0.75) payload = { "q": keywords, # @@ -144,7 +175,8 @@ def text( payload["p"] = "1" cache = set() - for _ in range(10): + for s in ("0", "20", "70", "120"): + payload["s"] = s resp = self._get_url( "GET", "https://links.duckduckgo.com/d.js", params=payload ) @@ -157,10 +189,7 @@ def text( if page_data is None: break - result_exists = False for row in page_data: - if "n" in row: - payload["s"] = row["n"].split("s=")[-1].split("&")[0] href = row.get("u", None) if ( href @@ -170,19 +199,13 @@ def text( cache.add(href) body = self._normalize(row["a"]) if body: - result_exists = True yield { "title": self._normalize(row["t"]), "href": href, "body": body, } - elif result_exists is False: - break - if result_exists is False: - break - ''' - def text( + def _text_html( self, keywords: str, region: str = "wt-wt", @@ -206,7 +229,7 @@ def text( safesearch_base = {"on": 1, "moderate": -1, "off": -2} payload = { "q": keywords, - "l": region, + "kl": region, "p": safesearch_base[safesearch.lower()], "df": timelimit, } @@ -217,10 +240,11 @@ def text( ) if resp is None: break + tree = html.fromstring(resp.content) if tree.xpath('//div[@class="no-results"]/text()'): return - result_exists = False + for e in tree.xpath('//div[contains(@class, "results_links")]'): href = e.xpath('.//a[contains(@class, "result__a")]/@href') href = href[0] if href else None @@ -232,16 +256,12 @@ def text( cache.add(href) title = e.xpath('.//a[contains(@class, "result__a")]/text()') body = e.xpath('.//a[contains(@class, "result__snippet")]//text()') - result_exists = True yield { "title": self._normalize(title[0]) if title else None, "href": href, "body": self._normalize("".join(body)) if body else None, } - if result_exists is False: - break - next_page = tree.xpath('.//div[@class="nav-link"]') next_page = next_page[-1] if next_page else None if next_page is None: @@ -250,7 +270,70 @@ def text( names = next_page.xpath('.//input[@type="hidden"]/@name') values = next_page.xpath('.//input[@type="hidden"]/@value') payload = {n: v for n, v in zip(names, values)} - sleep(1) + sleep(0.75) + + def _text_lite( + self, + keywords: str, + region: str = "wt-wt", + timelimit: Optional[str] = None, + ) -> Iterator[dict]: + """DuckDuckGo text search generator. Query params: https://duckduckgo.com/params + + Args: + keywords: keywords for query. + region: wt-wt, us-en, uk-en, ru-ru, etc. Defaults to "wt-wt". + timelimit: d, w, m, y. Defaults to None. + + Yields: + dict with search results. + + """ + assert keywords, "keywords is mandatory" + + payload = { + "q": keywords, + "kl": region, + "df": timelimit, + } + cache: Set[str] = set() + for s in ("0", "20", "70", "120"): + payload["s"] = s + resp = self._get_url( + "POST", "https://lite.duckduckgo.com/lite/", data=payload + ) + if resp is None: + break + + tree = html.fromstring(resp.content) + if "No more results." in tree.xpath("//table[1]//text()"): + return + + result_exists = False + for i, e in zip(cycle(range(1, 5)), tree.xpath("//table[last()]//tr")): + if i == 1: + href = e.xpath(".//a//@href") + href = href[0] if href else None + if ( + href is None + or href in cache + or href == f"http://www.google.com/search?q={keywords}" + ): + continue + title = e.xpath(".//a//text()")[0] + elif i == 2: + body = e.xpath(".//td[@class='result-snippet']//text()") + body = "".join(body).strip() + elif i == 3: + result_exists = True + yield { + "href": href, + "title": title, + "body": body, + } + if result_exists is False: + break + sleep(0.75) def images( self, @@ -290,6 +373,7 @@ def images( vqd = self._get_vqd(keywords) assert vqd, "error in getting vqd" + sleep(0.75) safesearch_base = {"on": 1, "moderate": 1, "off": -1} timelimit = f"time:{timelimit}" if timelimit else "" diff --git a/duckduckgo_search/version.py b/duckduckgo_search/version.py index a5cfdf5..dcbfb52 100755 --- a/duckduckgo_search/version.py +++ b/duckduckgo_search/version.py @@ -1 +1 @@ -__version__ = "3.4.1" +__version__ = "3.5.0" diff --git a/tests/test_duckduckgo_search.py b/tests/test_duckduckgo_search.py index 428083f..9d51ba6 100644 --- a/tests/test_duckduckgo_search.py +++ b/tests/test_duckduckgo_search.py @@ -14,6 +14,24 @@ def test_text(): break assert counter >= 25 +def test_text_html(): + results_gen = DDGS().text("cat", backend="html") + counter = 0 + for i, x in enumerate(results_gen): + counter += 1 + if i >= 25: + break + assert counter >= 25 + +def test_text_lite(): + results_gen = DDGS().text("cat", backend="lite") + counter = 0 + for i, x in enumerate(results_gen): + counter += 1 + if i >= 25: + break + assert counter >= 25 + def test_images(): results_gen = DDGS().images("cat")