Skip to content

Commit

Permalink
Merge pull request #81 from deedy5/v3.5.0
Browse files Browse the repository at this point in the history
v3.5.0
1. DDGS().text() - add new parameter backend:
        api - collect data from duckduckgo.com,
        html - collect data from html.duckduckgo.com,
        lite - collect data from lite.duckduckgo.com.
2. DDGS().text(backend='api') and DDGS().images() - add sleep(0.75) after receiving the vqd.
3. CLI - ddgs text - add -b (--backend) parameter.
4. add tests for DDGS().text(backend='html') and DDGS().text(backend='lite').
  • Loading branch information
deedy5 authored May 26, 2023
2 parents e7071db + bec968e commit 3ca7058
Show file tree
Hide file tree
Showing 5 changed files with 151 additions and 31 deletions.
15 changes: 12 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,6 @@ for r in ddgs_text_gen:
[Go To TOP](#TOP)

## 1. text() - text search by duckduckgo.com
*WARNING!: Since version v3.4.0, api requests have been replaced by html parsing. Set a delay of at least 1 second between function calls.*
```python
def text(
keywords: str,
Expand All @@ -188,7 +187,10 @@ def text(
region: wt-wt, us-en, uk-en, ru-ru, etc. Defaults to "wt-wt".
safesearch: on, moderate, off. Defaults to "moderate".
timelimit: d, w, m, y. Defaults to None.
backend: api, html, lite. Defaults to api.
api - collect data from https://duckduckgo.com,
html - collect data from https://html.duckduckgo.com,
lite - collect data from https://lite.duckduckgo.com.
Yields:
dict with search results.
Expand All @@ -200,7 +202,7 @@ from duckduckgo_search import DDGS

ddgs = DDGS()

keywords = 'Bella Ciao'
keywords = 'live free or die'
ddgs_text_gen = ddgs.text(keywords, region='wt-wt', safesearch='Off', timelimit='y')
for r in ddgs_text_gen:
print(r)
Expand All @@ -210,6 +212,13 @@ keywords = 'russia filetype:pdf'
ddgs_text_gen = ddgs.text(keywords, region='wt-wt', safesearch='Off', timelimit='y')
for r in ddgs_text_gen:
print(r)

# Use the lite backend and limit the number of results to 10
from itertools import islice

ddgs_text_gen = DDGS().text("notes from a dead house", backend="lite")
for r in islice(ddgs_text_gen, 10):
print(r)
```


Expand Down
25 changes: 17 additions & 8 deletions duckduckgo_search/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ def version():


@cli.command()
@click.option("-k", "--keywords", help="text search, keywords for query")
@click.option("-k", "--keywords", required=True, help="text search, keywords for query")
@click.option(
"-r",
"--region",
Expand Down Expand Up @@ -186,6 +186,13 @@ def version():
default=False,
help="download results to 'keywords' folder",
)
@click.option(
"-b",
"--backend",
default="api",
type=click.Choice(["api", "html", "lite"]),
help="which backend to use, default=api",
)
def text(keywords, output, download, max_results, *args, **kwargs):
data = []
for r in DDGS().text(keywords=keywords, *args, **kwargs):
Expand All @@ -205,7 +212,9 @@ def text(keywords, output, download, max_results, *args, **kwargs):


@cli.command()
@click.option("-k", "--keywords", help="answers search, keywords for query")
@click.option(
"-k", "--keywords", required=True, help="answers search, keywords for query"
)
@click.option(
"-o",
"--output",
Expand All @@ -226,7 +235,7 @@ def answers(keywords, output, *args, **kwargs):


@cli.command()
@click.option("-k", "--keywords", help="keywords for query")
@click.option("-k", "--keywords", required=True, help="keywords for query")
@click.option(
"-r",
"--region",
Expand Down Expand Up @@ -333,7 +342,7 @@ def images(keywords, output, download, max_results, *args, **kwargs):


@cli.command()
@click.option("-k", "--keywords", help="keywords for query")
@click.option("-k", "--keywords", required=True, help="keywords for query")
@click.option(
"-r",
"--region",
Expand Down Expand Up @@ -397,7 +406,7 @@ def videos(keywords, output, max_results, *args, **kwargs):


@cli.command()
@click.option("-k", "--keywords", help="keywords for query")
@click.option("-k", "--keywords", required=True, help="keywords for query")
@click.option(
"-r",
"--region",
Expand Down Expand Up @@ -446,7 +455,7 @@ def news(keywords, output, max_results, *args, **kwargs):


@cli.command()
@click.option("-k", "--keywords", help="keywords for query")
@click.option("-k", "--keywords", required=True, help="keywords for query")
@click.option(
"-p",
"--place",
Expand Down Expand Up @@ -504,7 +513,7 @@ def maps(keywords, output, max_results, *args, **kwargs):


@cli.command()
@click.option("-k", "--keywords", help="text for translation")
@click.option("-k", "--keywords", required=True, help="text for translation")
@click.option(
"-f",
"--from_",
Expand Down Expand Up @@ -535,7 +544,7 @@ def translate(keywords, output, *args, **kwargs):


@cli.command()
@click.option("-k", "--keywords", help="keywords for query")
@click.option("-k", "--keywords", required=True, help="keywords for query")
@click.option(
"-r",
"--region",
Expand Down
122 changes: 103 additions & 19 deletions duckduckgo_search/duckduckgo_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from datetime import datetime
from decimal import Decimal
from html import unescape
from itertools import cycle
from time import sleep
from typing import Deque, Dict, Iterator, Optional, Set
from urllib.parse import unquote
Expand Down Expand Up @@ -98,13 +99,42 @@ def _normalize(self, raw_html: str) -> str:
return unescape(re.sub(REGEX_STRIP_TAGS, "", raw_html))
return ""

'''
def text(
    self,
    keywords: str,
    region: str = "wt-wt",
    safesearch: str = "moderate",
    timelimit: Optional[str] = None,
    backend: str = "api",
) -> Iterator[dict]:
    """DuckDuckGo text search generator. Query params: https://duckduckgo.com/params

    Dispatches the query to one of the backend-specific generators and
    re-yields whatever that generator produces.

    Args:
        keywords: keywords for query.
        region: wt-wt, us-en, uk-en, ru-ru, etc. Defaults to "wt-wt".
        safesearch: on, moderate, off. Defaults to "moderate".
            Not forwarded to the lite backend.
        timelimit: d, w, m, y. Defaults to None.
        backend: api, html, lite. Defaults to api.
            api - collect data from https://duckduckgo.com,
            html - collect data from https://html.duckduckgo.com,
            lite - collect data from https://lite.duckduckgo.com.

    Yields:
        dict with search results.

    Raises:
        ValueError: if ``backend`` is not one of "api", "html", "lite".
            Because this is a generator, the error surfaces when iteration
            starts, not when the method is called.
    """
    if backend == "api":
        yield from self._text_api(keywords, region, safesearch, timelimit)
    elif backend == "html":
        yield from self._text_html(keywords, region, safesearch, timelimit)
    elif backend == "lite":
        # The lite generator takes no safesearch argument.
        yield from self._text_lite(keywords, region, timelimit)
    else:
        # Previously an unknown backend fell through every branch and the
        # caller got an empty iterator with no hint that the name was wrong.
        raise ValueError(
            f"unsupported backend {backend!r}; expected 'api', 'html' or 'lite'"
        )

def _text_api(
self,
keywords: str,
region: str = "wt-wt",
safesearch: str = "moderate",
timelimit: Optional[str] = None,
) -> Iterator[dict]:
"""DuckDuckGo text search generator. Query params: https://duckduckgo.com/params
Expand All @@ -122,6 +152,7 @@ def text(

vqd = self._get_vqd(keywords)
assert vqd, "error in getting vqd"
sleep(0.75)

payload = {
"q": keywords, #
Expand All @@ -144,7 +175,8 @@ def text(
payload["p"] = "1"

cache = set()
for _ in range(10):
for s in ("0", "20", "70", "120"):
payload["s"] = s
resp = self._get_url(
"GET", "https://links.duckduckgo.com/d.js", params=payload
)
Expand All @@ -157,10 +189,7 @@ def text(
if page_data is None:
break

result_exists = False
for row in page_data:
if "n" in row:
payload["s"] = row["n"].split("s=")[-1].split("&")[0]
href = row.get("u", None)
if (
href
Expand All @@ -170,19 +199,13 @@ def text(
cache.add(href)
body = self._normalize(row["a"])
if body:
result_exists = True
yield {
"title": self._normalize(row["t"]),
"href": href,
"body": body,
}
elif result_exists is False:
break
if result_exists is False:
break
'''

def text(
def _text_html(
self,
keywords: str,
region: str = "wt-wt",
Expand All @@ -206,7 +229,7 @@ def text(
safesearch_base = {"on": 1, "moderate": -1, "off": -2}
payload = {
"q": keywords,
"l": region,
"kl": region,
"p": safesearch_base[safesearch.lower()],
"df": timelimit,
}
Expand All @@ -217,10 +240,11 @@ def text(
)
if resp is None:
break

tree = html.fromstring(resp.content)
if tree.xpath('//div[@class="no-results"]/text()'):
return
result_exists = False

for e in tree.xpath('//div[contains(@class, "results_links")]'):
href = e.xpath('.//a[contains(@class, "result__a")]/@href')
href = href[0] if href else None
Expand All @@ -232,16 +256,12 @@ def text(
cache.add(href)
title = e.xpath('.//a[contains(@class, "result__a")]/text()')
body = e.xpath('.//a[contains(@class, "result__snippet")]//text()')
result_exists = True
yield {
"title": self._normalize(title[0]) if title else None,
"href": href,
"body": self._normalize("".join(body)) if body else None,
}

if result_exists is False:
break

next_page = tree.xpath('.//div[@class="nav-link"]')
next_page = next_page[-1] if next_page else None
if next_page is None:
Expand All @@ -250,7 +270,70 @@ def text(
names = next_page.xpath('.//input[@type="hidden"]/@name')
values = next_page.xpath('.//input[@type="hidden"]/@value')
payload = {n: v for n, v in zip(names, values)}
sleep(1)
sleep(0.75)

def _text_lite(
self,
keywords: str,
region: str = "wt-wt",
timelimit: Optional[str] = None,
) -> Iterator[dict]:
"""DuckDuckGo text search generator. Query params: https://duckduckgo.com/params
Args:
keywords: keywords for query.
region: wt-wt, us-en, uk-en, ru-ru, etc. Defaults to "wt-wt".
timelimit: d, w, m, y. Defaults to None.
Yields:
dict with search results.
"""
assert keywords, "keywords is mandatory"

payload = {
"q": keywords,
"kl": region,
"df": timelimit,
}
cache: Set[str] = set()
for s in ("0", "20", "70", "120"):
payload["s"] = s
resp = self._get_url(
"POST", "https://lite.duckduckgo.com/lite/", data=payload
)
if resp is None:
break

tree = html.fromstring(resp.content)
if "No more results." in tree.xpath("//table[1]//text()"):
return

result_exists = False
for i, e in zip(cycle(range(1, 5)), tree.xpath("//table[last()]//tr")):
if i == 1:
href = e.xpath(".//a//@href")
href = href[0] if href else None
if (
href is None
or href in cache
or href == f"http://www.google.com/search?q={keywords}"
):
continue
title = e.xpath(".//a//text()")[0]
elif i == 2:
body = e.xpath(".//td[@class='result-snippet']//text()")
body = "".join(body).strip()
elif i == 3:
result_exists = True
yield {
"href": href,
"title": title,
"body": body,
}
if result_exists is False:
break
sleep(0.75)

def images(
self,
Expand Down Expand Up @@ -290,6 +373,7 @@ def images(

vqd = self._get_vqd(keywords)
assert vqd, "error in getting vqd"
sleep(0.75)

safesearch_base = {"on": 1, "moderate": 1, "off": -1}
timelimit = f"time:{timelimit}" if timelimit else ""
Expand Down
2 changes: 1 addition & 1 deletion duckduckgo_search/version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "3.4.1"
__version__ = "3.5.0"
18 changes: 18 additions & 0 deletions tests/test_duckduckgo_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,24 @@ def test_text():
break
assert counter >= 25

def test_text_html():
    """The html backend must produce at least 25 results for the query "cat"."""
    fetched = 0
    # Counting from 1 makes the loop variable the running total; stop once
    # the 26th result has been pulled, exactly like the index-based check.
    for fetched, _ in enumerate(DDGS().text("cat", backend="html"), start=1):
        if fetched > 25:
            break
    assert fetched >= 25

def test_text_lite():
    """The lite backend must produce at least 25 results for the query "cat"."""
    fetched = 0
    # Counting from 1 makes the loop variable the running total; stop once
    # the 26th result has been pulled, exactly like the index-based check.
    for fetched, _ in enumerate(DDGS().text("cat", backend="lite"), start=1):
        if fetched > 25:
            break
    assert fetched >= 25


def test_images():
results_gen = DDGS().images("cat")
Expand Down

0 comments on commit 3ca7058

Please sign in to comment.