Merge pull request #87 from deedy5/v3.7.0

V3.7.0: translate() - migrate to httpx; correct usage of socks5 proxies; random useragent choice; text(backend="lite") - normalize title and body; update tests; cli: create file only after successful response
deedy5 · May 30, 2023 · 0277fad · 0277fad
2 parents 0982937 + 6b0f9e2
commit 0277fad
Show file tree

Hide file tree

Showing 7 changed files with 164 additions and 185 deletions.
diff --git a/README.md b/README.md
@@ -139,30 +139,24 @@ ___
 
 ## Using proxy
 If you send too many requests, the site blocks your IP for up to one minute and DDGS will raise an exception.
-In this case, you need repeat again after a while or to use a proxy ([httpx documentation](https://www.python-httpx.org/advanced)).
+In this case, you need to try again after a while or use a proxy ([documentation](https://www.python-httpx.org/advanced/#http-proxying)).
 You can set a timeout if the proxy takes a long time to respond (default timeout=10).
 
 *1. The easiest way. Launch the Tor Browser*
 ```python3
 from duckduckgo_search import DDGS
 
-proxies = {
-    "all://": "socks5h://localhost:9150",
-}
-ddgs_text_gen = DDGS(proxies=proxies, timeout=20).text("something you need")
-for r in ddgs_text_gen:
-    print(r)
+with DDGS(proxies="socks5://localhost:9150", timeout=20) as ddgs:
+    for r in ddgs.text("something you need"):
+        print(r)
 ```
 *2. Use any proxy server* (*example with [iproyal residential proxies](https://iproyal.com?r=residential_proxies)*)
 ```python3
 from duckduckgo_search import DDGS
 
-proxies = {
-    "all://": "https://user:password@geo.iproyal.com:32325",
-}
-ddgs_text_gen = DDGS(proxies=proxies, timeout=20).text("something you need")
-for r in ddgs_text_gen:
-    print(r)
+with DDGS(proxies="socks5://user:password@geo.iproyal.com:32325", timeout=20) as ddgs:
+    for r in ddgs.text("something you need"):
+        print(r)
 ```
 
 [Go To TOP](#TOP)
@@ -196,25 +190,22 @@ def text(
 ```python
 from duckduckgo_search import DDGS
 
-ddgs = DDGS()
-
-keywords = 'live free or die'
-ddgs_text_gen = ddgs.text(keywords, region='wt-wt', safesearch='Off', timelimit='y')
-for r in ddgs_text_gen:
-    print(r)
+with DDGS() as ddgs:
+    for r in ddgs.text('live free or die', region='wt-wt', safesearch='Off', timelimit='y'):
+        print(r)
 
 # Searching for pdf files
-keywords = 'russia filetype:pdf'
-ddgs_text_gen = ddgs.text(keywords, region='wt-wt', safesearch='Off', timelimit='y')
-for r in ddgs_text_gen:
-    print(r)
+with DDGS() as ddgs:
+    for r in ddgs.text('russia filetype:pdf', region='wt-wt', safesearch='Off', timelimit='y'):
+        print(r)
 
 # Using lite backend and limit the number of results to 10
 from itertools import islice
 
-ddgs_text_gen = DDGS().text("notes from a dead house", backend="lite")
-for r in islice(ddgs_text_gen, 10):
-    print(r)
+with DDGS() as ddgs:
+    ddgs_gen = ddgs.text("notes from a dead house", backend="lite")
+    for r in islice(ddgs_gen, 10):
+        print(r)
 ```
 
 
@@ -238,12 +229,9 @@ def answers(keywords: str) -> Generator[dict, None, None]:
 ```python
 from duckduckgo_search import DDGS
 
-ddgs = DDGS()
-
-keywords = 'sun'
-ddgs_answers_gen = ddgs.answers(keywords)
-for r in ddgs_answers_gen:
-    print(r)
+with DDGS() as ddgs:
+    for r in ddgs.answers("sun"):
+        print(r)
 ```
 
 [Go To TOP](#TOP)
@@ -289,21 +277,20 @@ def images(
 ```python
 from duckduckgo_search import DDGS
 
-ddgs = DDGS()
-
-keywords = 'butterfly'
-ddgs_images_gen = ddgs.images(
-    keywords,
-    region="wt-wt",
-    safesearch="Off",
-    size=None,
-    color="Monochrome",
-    type_image=None,
-    layout=None,
-    license_image=None,
-)
-for r in ddgs_images_gen:
-    print(r)
+with DDGS() as ddgs:
+    keywords = 'butterfly'
+    ddgs_images_gen = ddgs.images(
+      keywords,
+      region="wt-wt",
+      safesearch="Off",
+      size=None,
+      color="Monochrome",
+      type_image=None,
+      layout=None,
+      license_image=None,
+    )
+    for r in ddgs_images_gen:
+        print(r)
 ```
 
 [Go To TOP](#TOP)
@@ -340,19 +327,18 @@ def videos(
 ```python
 from duckduckgo_search import DDGS
 
-ddgs = DDGS()
-
-keywords = 'tesla'
-ddgs_videos_gen = ddgs.videos(
-    keywords,
-    region="wt-wt",
-    safesearch="Off",
-    timelimit="w",
-    resolution="high",
-    duration="medium",
-)
-for r in ddgs_videos_gen:
-    print(r)
+with DDGS() as ddgs:
+    keywords = 'tesla'
+    ddgs_videos_gen = ddgs.videos(
+      keywords,
+      region="wt-wt",
+      safesearch="Off",
+      timelimit="w",
+      resolution="high",
+      duration="medium",
+    )
+    for r in ddgs_videos_gen:
+        print(r)
 ```
 
 
@@ -384,17 +370,16 @@ def news(
 ```python
 from duckduckgo_search import DDGS
 
-ddgs = DDGS()
-
-keywords = 'How soon the sun will die'
-ddgs_news_gen = ddgs.news(
-    keywords,
-    region="wt-wt",
-    safesearch="Off",
-    timelimit="m",
-)
-for r in ddgs_news_gen:
-    print(r)
+with DDGS() as ddgs:
+    keywords = 'How soon the sun will die'
+    ddgs_news_gen = ddgs.news(
+      keywords,
+      region="wt-wt",
+      safesearch="Off",
+      timelimit="m",
+    )
+    for r in ddgs_news_gen:
+        print(r)
 ```
 
 [Go To TOP](#TOP)
@@ -440,15 +425,9 @@ def maps(
 ```python
 from duckduckgo_search import DDGS
 
-ddgs = DDGS()
-
-keywords = 'school'
-ddgs_maps_gen = ddgs.maps(
-    keywords,
-    place="Uganda",
-)
-for r in ddgs_maps_gen:
-    print(r)
+with DDGS() as ddgs:
+    for r in ddgs.maps("school", place="Uganda"):
+        print(r)
 ```
 
 [Go To TOP](#TOP)
@@ -477,11 +456,10 @@ def translate(
 ```python
 from duckduckgo_search import DDGS
 
-ddgs = DDGS()
-
-keywords = 'school'
-r = ddgs.translate(keywords, to="de")
-print(r)
+with DDGS() as ddgs:
+    keywords = 'school'
+    r = ddgs.translate(keywords, to="de")
+    print(r)
 ```
 
 [Go To TOP](#TOP)
@@ -508,12 +486,9 @@ def suggestions(
 ```python3
 from duckduckgo_search import DDGS
 
-ddgs = DDGS()
-
-keywords = 'fly'
-ddgs_suggestions_gen = ddgs.suggestions(keywords)
-for r in ddgs_suggestions_gen:
-    print(r)
+with DDGS() as ddgs:
+    for r in ddgs.suggestions("fly"):
+        print(r)
 ```
 
 [Go To TOP](#TOP)
diff --git a/duckduckgo_search/cli.py b/duckduckgo_search/cli.py
@@ -4,14 +4,14 @@
 import os
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from datetime import datetime
-from shutil import copyfileobj
+from random import choice
 from urllib.parse import unquote
 
 import click
 import httpx
 
 # isort: off
-from .duckduckgo_search import DDGS
+from .duckduckgo_search import DDGS, USERAGENTS
 from .version import __version__
 
 # isort: on
@@ -77,23 +77,25 @@ def print_data(data):
 
 def sanitize_keywords(keywords):
     keywords = (
-        keywords.replace(" filetype:", "_")
+        keywords.replace("filetype", "")
+        .replace(":", "")
         .replace('"', "'")
-        .replace("site:", "")
+        .replace("site", "")
         .replace(" ", "_")
         .replace("/", "_")
         .replace("\\", "_")
+        .replace(" ", "")
     )
     return keywords
 
 
 def download_file(url, dir_path, filename):
     headers = {
-        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; rv:102.0) Gecko/20100101 Firefox/102.0",
+        "User-Agent": choice(USERAGENTS),
     }
     try:
-        with open(os.path.join(dir_path, filename), "wb") as file:
-            with httpx.stream("GET", url, headers=headers) as resp:
+        with httpx.stream("GET", url, headers=headers) as resp:
+            with open(os.path.join(dir_path, filename), "wb") as file:
                 for chunk in resp.iter_bytes():
                     file.write(chunk)
         logger.info(f"File downloaded {url}")

diff --git a/duckduckgo_search/duckduckgo_search.py b/duckduckgo_search/duckduckgo_search.py
@@ -6,22 +6,37 @@
 from decimal import Decimal
 from html import unescape
 from itertools import cycle
+from random import choice
 from time import sleep
-from typing import Deque, Dict, Iterator, Optional, Set
+from typing import Deque, Dict, Iterator, Optional, Set, Union
 from urllib.parse import unquote
 
 import httpx
-import requests
 from lxml import html
 
 logger = logging.getLogger(__name__)
 
+REGEX_500_IN_URL = re.compile(r"[0-9]{3}-[0-9]{2}.js")
+REGEX_STRIP_TAGS = re.compile("<.*?>")
+
+USERAGENTS = [
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36",
+    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36",
+    "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36",
+    "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36",
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/113.0",
+    "Mozilla/5.0 (Macintosh; Intel Mac OS X 13.4; rv:109.0) Gecko/20100101 Firefox/113.0",
+    "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.5 Safari/605.1.15",
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.57",
+    "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.57",
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 OPR/99.0.0.0",
+    "Mozilla/5.0 (Windows NT 10.0; WOW64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 OPR/99.0.0.0",
+    "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 OPR/99.0.0.0",
+]
 HEADERS = {
-    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; rv:102.0) Gecko/20100101 Firefox/102.0",
+    "User-Agent": choice(USERAGENTS),
     "Referer": "https://duckduckgo.com/",
 }
-REGEX_500_IN_URL = re.compile(r"[0-9]{3}-[0-9]{2}.js")
-REGEX_STRIP_TAGS = re.compile("<.*?>")
 
 
 @dataclass
@@ -46,7 +61,7 @@ class DDGS:
     def __init__(
         self,
         headers: Optional[Dict[str, str]] = None,
-        proxies: Optional[Dict] = None,
+        proxies: Optional[Union[Dict, str]] = None,
         timeout: int = 10,
     ) -> None:
         self._client = httpx.Client(
@@ -67,7 +82,9 @@ def _get_url(
     ) -> Optional[httpx._models.Response]:
         for i in range(3):
             try:
-                resp = self._client.request(method, url, **kwargs)
+                resp = self._client.request(
+                    method, url, follow_redirects=True, **kwargs
+                )
                 if self._is_500_in_url(str(resp.url)) or resp.status_code == 202:
                     raise httpx._exceptions.HTTPError("")
                 resp.raise_for_status()
@@ -335,9 +352,9 @@ def _text_lite(
                 elif i == 3:
                     result_exists = True
                     yield {
+                        "title": self._normalize(title),
                         "href": href,
-                        "title": title,
-                        "body": body,
+                        "body": self._normalize(body),
                     }
             if result_exists is False:
                 break
@@ -668,9 +685,7 @@ def suggestions(
             "q": keywords,
             "kl": region,
         }
-        resp = self._get_url(
-            "GET", "https://duckduckgo.com/ac", params=payload, follow_redirects=True
-        )
+        resp = self._get_url("GET", "https://duckduckgo.com/ac", params=payload)
         if resp is None:
             return None
         try:
@@ -858,11 +873,13 @@ def translate(
         payload = {
             "vqd": vqd,
             "query": "translate",
-            "from": from_,
             "to": to,
         }
+        if from_:
+            payload["from"] = from_
 
-        resp = requests.post(
+        resp = self._get_url(
+            "POST",
             "https://duckduckgo.com/translation.js",
             params=payload,
             data=keywords.encode(),

diff --git a/duckduckgo_search/version.py b/duckduckgo_search/version.py
@@ -1 +1 @@
-__version__ = "3.6.0"
+__version__ = "3.7.0"
diff --git a/pyproject.toml b/pyproject.toml
@@ -30,8 +30,7 @@ classifiers = [
 dependencies = [
     "click>=8.1.3",
     "lxml>=4.9.2",
-    "httpx[http2]>=0.24.1",
-    "requests>=2.31.0",
+    "httpx[http2,socks]>=0.24.1",
 ]
 dynamic = ["version"]