Skip to content

Commit

Permalink
Merge pull request #87 from deedy5/v3.7.0
Browse files Browse the repository at this point in the history
V3.7.0
    translate() - migrate to httpx
    correct usage socks5 proxies
    random useragent choice
    text(backend="lite") - normalize title and body
    update tests
    cli: create file only after successful response
  • Loading branch information
deedy5 authored May 30, 2023
2 parents 0982937 + 6b0f9e2 commit 0277fad
Show file tree
Hide file tree
Showing 7 changed files with 164 additions and 185 deletions.
157 changes: 66 additions & 91 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -139,30 +139,24 @@ ___

## Using proxy
If you send too many requests, the site blocks your IP for up to one minute and DDGS will raise an exception.
In this case, you need repeat again after a while or to use a proxy ([httpx documentation](https://www.python-httpx.org/advanced)).
In this case, you need to retry after a while or use a proxy ([documentation](https://www.python-httpx.org/advanced/#http-proxying)).
You can set a timeout if the proxy takes a long time to respond (default timeout=10).

*1. The easiest way. Launch the Tor Browser*
```python3
from duckduckgo_search import DDGS

proxies = {
"all://": "socks5h://localhost:9150",
}
ddgs_text_gen = DDGS(proxies=proxies, timeout=20).text("something you need")
for r in ddgs_text_gen:
print(r)
with DDGS(proxies="socks5://localhost:9150", timeout=20) as ddgs:
for r in ddgs.text("something you need"):
print(r)
```
*2. Use any proxy server* (*example with [iproyal residential proxies](https://iproyal.com?r=residential_proxies)*)
```python3
from duckduckgo_search import DDGS

proxies = {
"all://": "https://user:password@geo.iproyal.com:32325",
}
ddgs_text_gen = DDGS(proxies=proxies, timeout=20).text("something you need")
for r in ddgs_text_gen:
print(r)
with DDGS(proxies="socks5://user:password@geo.iproyal.com:32325", timeout=20) as ddgs:
for r in ddgs.text("something you need"):
print(r)
```

[Go To TOP](#TOP)
Expand Down Expand Up @@ -196,25 +190,22 @@ def text(
```python
from duckduckgo_search import DDGS

ddgs = DDGS()

keywords = 'live free or die'
ddgs_text_gen = ddgs.text(keywords, region='wt-wt', safesearch='Off', timelimit='y')
for r in ddgs_text_gen:
print(r)
with DDGS() as ddgs:
for r in ddgs.text('live free or die', region='wt-wt', safesearch='Off', timelimit='y'):
print(r)

# Searching for pdf files
keywords = 'russia filetype:pdf'
ddgs_text_gen = ddgs.text(keywords, region='wt-wt', safesearch='Off', timelimit='y')
for r in ddgs_text_gen:
print(r)
with DDGS() as ddgs:
for r in ddgs.text('russia filetype:pdf', region='wt-wt', safesearch='Off', timelimit='y'):
print(r)

# Using the lite backend and limiting the number of results to 10
from itertools import islice

ddgs_text_gen = DDGS().text("notes from a dead house", backend="lite")
for r in islice(ddgs_text_gen, 10):
print(r)
with DDGS() as ddgs:
ddgs_gen = ddgs.text("notes from a dead house", backend="lite")
for r in islice(ddgs_gen, 10):
print(r)
```


Expand All @@ -238,12 +229,9 @@ def answers(keywords: str) -> Generator[dict, None, None]:
```python
from duckduckgo_search import DDGS

ddgs = DDGS()

keywords = 'sun'
ddgs_answers_gen = ddgs.answers(keywords)
for r in ddgs_answers_gen:
print(r)
with DDGS() as ddgs:
for r in ddgs.answers("sun"):
print(r)
```

[Go To TOP](#TOP)
Expand Down Expand Up @@ -289,21 +277,20 @@ def images(
```python
from duckduckgo_search import DDGS

ddgs = DDGS()

keywords = 'butterfly'
ddgs_images_gen = ddgs.images(
keywords,
region="wt-wt",
safesearch="Off",
size=None,
color="Monochrome",
type_image=None,
layout=None,
license_image=None,
)
for r in ddgs_images_gen:
print(r)
with DDGS() as ddgs:
keywords = 'butterfly'
ddgs_images_gen = ddgs.images(
keywords,
region="wt-wt",
safesearch="Off",
size=None,
color="Monochrome",
type_image=None,
layout=None,
license_image=None,
)
for r in ddgs_images_gen:
print(r)
```

[Go To TOP](#TOP)
Expand Down Expand Up @@ -340,19 +327,18 @@ def videos(
```python
from duckduckgo_search import DDGS

ddgs = DDGS()

keywords = 'tesla'
ddgs_videos_gen = ddgs.videos(
keywords,
region="wt-wt",
safesearch="Off",
timelimit="w",
resolution="high",
duration="medium",
)
for r in ddgs_videos_gen:
print(r)
with DDGS() as ddgs:
keywords = 'tesla'
ddgs_videos_gen = ddgs.videos(
keywords,
region="wt-wt",
safesearch="Off",
timelimit="w",
resolution="high",
duration="medium",
)
for r in ddgs_videos_gen:
print(r)
```


Expand Down Expand Up @@ -384,17 +370,16 @@ def news(
```python
from duckduckgo_search import DDGS

ddgs = DDGS()

keywords = 'How soon the sun will die'
ddgs_news_gen = ddgs.news(
keywords,
region="wt-wt",
safesearch="Off",
timelimit="m",
)
for r in ddgs_news_gen:
print(r)
with DDGS() as ddgs:
keywords = 'How soon the sun will die'
ddgs_news_gen = ddgs.news(
keywords,
region="wt-wt",
safesearch="Off",
timelimit="m",
)
for r in ddgs_news_gen:
print(r)
```

[Go To TOP](#TOP)
Expand Down Expand Up @@ -440,15 +425,9 @@ def maps(
```python
from duckduckgo_search import DDGS

ddgs = DDGS()

keywords = 'school'
ddgs_maps_gen = ddgs.maps(
keywords,
place="Uganda",
)
for r in ddgs_maps_gen:
print(r)
with DDGS() as ddgs:
for r in ddgs.maps("school", place="Uganda"):
print(r)
```

[Go To TOP](#TOP)
Expand Down Expand Up @@ -477,11 +456,10 @@ def translate(
```python
from duckduckgo_search import DDGS

ddgs = DDGS()

keywords = 'school'
r = ddgs.translate(keywords, to="de")
print(r)
with DDGS() as ddgs:
keywords = 'school'
r = ddgs.translate(keywords, to="de")
print(r)
```

[Go To TOP](#TOP)
Expand All @@ -508,12 +486,9 @@ def suggestions(
```python3
from duckduckgo_search import DDGS

ddgs = DDGS()

keywords = 'fly'
ddgs_suggestions_gen = ddgs.suggestions(keywords)
for r in ddgs_suggestions_gen:
print(r)
with DDGS() as ddgs:
for r in ddgs.suggestions("fly"):
print(r)
```

[Go To TOP](#TOP)
16 changes: 9 additions & 7 deletions duckduckgo_search/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,14 @@
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime
from shutil import copyfileobj
from random import choice
from urllib.parse import unquote

import click
import httpx

# isort: off
from .duckduckgo_search import DDGS
from .duckduckgo_search import DDGS, USERAGENTS
from .version import __version__

# isort: on
Expand Down Expand Up @@ -77,23 +77,25 @@ def print_data(data):

def sanitize_keywords(keywords):
keywords = (
keywords.replace(" filetype:", "_")
keywords.replace("filetype", "")
.replace(":", "")
.replace('"', "'")
.replace("site:", "")
.replace("site", "")
.replace(" ", "_")
.replace("/", "_")
.replace("\\", "_")
.replace(" ", "")
)
return keywords


def download_file(url, dir_path, filename):
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; rv:102.0) Gecko/20100101 Firefox/102.0",
"User-Agent": choice(USERAGENTS),
}
try:
with open(os.path.join(dir_path, filename), "wb") as file:
with httpx.stream("GET", url, headers=headers) as resp:
with httpx.stream("GET", url, headers=headers) as resp:
with open(os.path.join(dir_path, filename), "wb") as file:
for chunk in resp.iter_bytes():
file.write(chunk)
logger.info(f"File downloaded {url}")
Expand Down
45 changes: 31 additions & 14 deletions duckduckgo_search/duckduckgo_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,22 +6,37 @@
from decimal import Decimal
from html import unescape
from itertools import cycle
from random import choice
from time import sleep
from typing import Deque, Dict, Iterator, Optional, Set
from typing import Deque, Dict, Iterator, Optional, Set, Union
from urllib.parse import unquote

import httpx
import requests
from lxml import html

logger = logging.getLogger(__name__)

REGEX_500_IN_URL = re.compile(r"[0-9]{3}-[0-9]{2}.js")
REGEX_STRIP_TAGS = re.compile("<.*?>")

USERAGENTS = [
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/113.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 13.4; rv:109.0) Gecko/20100101 Firefox/113.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 13_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.5 Safari/605.1.15",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.57",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.57",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 OPR/99.0.0.0",
"Mozilla/5.0 (Windows NT 10.0; WOW64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 OPR/99.0.0.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 OPR/99.0.0.0",
]
HEADERS = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; rv:102.0) Gecko/20100101 Firefox/102.0",
"User-Agent": choice(USERAGENTS),
"Referer": "https://duckduckgo.com/",
}
REGEX_500_IN_URL = re.compile(r"[0-9]{3}-[0-9]{2}.js")
REGEX_STRIP_TAGS = re.compile("<.*?>")


@dataclass
Expand All @@ -46,7 +61,7 @@ class DDGS:
def __init__(
self,
headers: Optional[Dict[str, str]] = None,
proxies: Optional[Dict] = None,
proxies: Optional[Union[Dict, str]] = None,
timeout: int = 10,
) -> None:
self._client = httpx.Client(
Expand All @@ -67,7 +82,9 @@ def _get_url(
) -> Optional[httpx._models.Response]:
for i in range(3):
try:
resp = self._client.request(method, url, **kwargs)
resp = self._client.request(
method, url, follow_redirects=True, **kwargs
)
if self._is_500_in_url(str(resp.url)) or resp.status_code == 202:
raise httpx._exceptions.HTTPError("")
resp.raise_for_status()
Expand Down Expand Up @@ -335,9 +352,9 @@ def _text_lite(
elif i == 3:
result_exists = True
yield {
"title": self._normalize(title),
"href": href,
"title": title,
"body": body,
"body": self._normalize(body),
}
if result_exists is False:
break
Expand Down Expand Up @@ -668,9 +685,7 @@ def suggestions(
"q": keywords,
"kl": region,
}
resp = self._get_url(
"GET", "https://duckduckgo.com/ac", params=payload, follow_redirects=True
)
resp = self._get_url("GET", "https://duckduckgo.com/ac", params=payload)
if resp is None:
return None
try:
Expand Down Expand Up @@ -858,11 +873,13 @@ def translate(
payload = {
"vqd": vqd,
"query": "translate",
"from": from_,
"to": to,
}
if from_:
payload["from"] = from_

resp = requests.post(
resp = self._get_url(
"POST",
"https://duckduckgo.com/translation.js",
params=payload,
data=keywords.encode(),
Expand Down
2 changes: 1 addition & 1 deletion duckduckgo_search/version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "3.6.0"
__version__ = "3.7.0"
3 changes: 1 addition & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,7 @@ classifiers = [
dependencies = [
"click>=8.1.3",
"lxml>=4.9.2",
"httpx[http2]>=0.24.1",
"requests>=2.31.0",
"httpx[http2,socks]>=0.24.1",
]
dynamic = ["version"]

Expand Down
Loading

0 comments on commit 0277fad

Please sign in to comment.