V3.2.0 (#73)
1. _get_url() raises an exception after unsuccessful retries instead of returning None (see the sketch after this list),
2. assert vqd instead of returning None,
3. update requirements,
4. update README.
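
In practice this changes how callers handle rate limiting. A minimal sketch, assuming duckduckgo_search v3.2.0; the query string is chosen for illustration:

```python3
import requests

from duckduckgo_search import DDGS

ddgs = DDGS(timeout=10)
try:
    # text() is a generator; requests are issued lazily during iteration
    for result in ddgs.text("python programming"):
        print(result["href"])
except requests.exceptions.HTTPError as ex:
    # raised after unsuccessful retries, e.g.
    # "418 Client Error: for url: https://duckduckgo.com/" when rate-limited
    print(f"request failed after retries: {ex}")
except AssertionError:
    # raised when the vqd token could not be obtained (change 2 above)
    print("error in getting vqd")
```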
deedy5 authored May 23, 2023
1 parent f241773 commit cd0b070
Showing 5 changed files with 92 additions and 93 deletions.
6 changes: 4 additions & 2 deletions README.md
@@ -139,8 +139,10 @@ ___
[Go To TOP](#TOP)

## Using proxy
-The site blocks ip for a few minutes if you send too many requests. In this case, you need to use a proxy.
-Also you can set a timeout if the proxy takes a long time to respond (default timeout=10).
+If you send too many requests, the site blocks your IP for up to one minute and DDGS raises an exception:
+`requests.exceptions.HTTPError: 418 Client Error: for url: https://duckduckgo.com/`.
+In this case, retry after a while or use a proxy.
+You can set a timeout if the proxy takes a long time to respond (default timeout=10).

*1. The easiest way. Launch the Tor Browser*
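The body of this example is truncated in the diff view. A minimal sketch of what it would look like, assuming the Tor Browser is running with its default SOCKS5 port 9150 (requests needs the PySocks extra, `pip install "requests[socks]"`, to speak SOCKS):

```python3
from duckduckgo_search import DDGS

# Assumption: Tor Browser listens on localhost:9150;
# the socks5h scheme also resolves DNS through the proxy.
proxies = {
    "http": "socks5h://localhost:9150",
    "https": "socks5h://localhost:9150",
}
ddgs = DDGS(proxies=proxies, timeout=20)
for result in ddgs.text("tor project"):
    print(result["title"])
```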
173 changes: 85 additions & 88 deletions duckduckgo_search/duckduckgo_search.py
@@ -48,7 +48,6 @@ def __init__(
        proxies: Optional[Dict[str, str]] = None,
        timeout: int = 10,
    ) -> None:
-        self.headers = headers if headers else HEADERS
        self._session = requests.Session()
        self._session.headers.update(headers if headers else HEADERS)
        self._session.proxies.update(proxies if proxies else {})
@@ -64,10 +63,11 @@ def _get_url(self, method: str, url: str, **kwargs) -> Optional[Response]:
                    raise requests.HTTPError
                resp.raise_for_status()
                return resp
-            except (HTTPError, Timeout) as ex:
+            except Exception as ex:
                logger.warning(f"_get_url() {url} {type(ex).__name__} {ex}")
-                if i < 2:
-                    sleep(2**i)
+                if i >= 2 or "418" in str(ex):
+                    raise ex
+            sleep(2**i)
        return None

    def _get_vqd(self, keywords: str) -> Optional[str]:
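
Pieced together, the new retry loop reads roughly as follows. This is a sketch, not the verbatim source: the `for` loop and the request call sit outside the hunk and are reconstructed from context:

```python3
# Up to three attempts with exponential backoff; a 418 (rate limit) or the
# final failure is re-raised instead of being swallowed and returning None.
for i in range(3):
    try:
        resp = self._session.request(method, url, **kwargs)
        resp.raise_for_status()
        return resp
    except Exception as ex:
        logger.warning(f"_get_url() {url} {type(ex).__name__} {ex}")
        if i >= 2 or "418" in str(ex):
            raise ex
    sleep(2**i)  # wait 1s, then 2s, before the next attempt
return None  # effectively unreachable: the last attempt either returns or raises
```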
@@ -119,8 +119,7 @@ def text(
        assert keywords, "keywords is mandatory"

        vqd = self._get_vqd(keywords)
-        if not vqd:
-            return None
+        assert vqd, "error in getting vqd"

        safesearch_base = {"on": 1, "moderate": -1, "off": -2}
        payload = {
@@ -145,29 +144,30 @@ def text(
            except (AttributeError, JSONDecodeError):
                break

-            result_exists = False
-            for row in page_data:
-                if "n" in row:
-                    payload["s"] = row["n"].split("s=")[-1].split("&")[0]  # next page
-                href = row.get("u", None)
-                if (
-                    href
-                    and href not in cache
-                    and href != f"http://www.google.com/search?q={keywords}"
-                ):
-                    cache.add(href)
-                    body = self._normalize(row["a"])
-                    if body:
-                        result_exists = True
-                        yield {
-                            "title": self._normalize(row["t"]),
-                            "href": href,
-                            "body": body,
-                        }
-                elif result_exists is False:
-                    break
-            if result_exists is False:
-                break
+            if page_data:
+                result_exists = False
+                for row in page_data:
+                    if "n" in row:
+                        payload["s"] = row["n"].split("s=")[-1].split("&")[0]
+                    href = row.get("u", None)
+                    if (
+                        href
+                        and href not in cache
+                        and href != f"http://www.google.com/search?q={keywords}"
+                    ):
+                        cache.add(href)
+                        body = self._normalize(row["a"])
+                        if body:
+                            result_exists = True
+                            yield {
+                                "title": self._normalize(row["t"]),
+                                "href": href,
+                                "body": body,
+                            }
+                    elif result_exists is False:
+                        break
+                if result_exists is False:
+                    break

    def images(
        self,
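
The effect of the new `if page_data:` guard in text() is that pagination stops cleanly on an empty page, while `payload["s"]` carries the offset to the next one. A usage sketch (the `islice` cap is only for illustration):

```python3
from itertools import islice

from duckduckgo_search import DDGS

ddgs = DDGS()
# text() fetches further result pages lazily as you keep iterating
for r in islice(ddgs.text("duckduckgo"), 25):
    print(r["title"], r["href"])
```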
@@ -206,8 +206,7 @@ def images(
        assert keywords, "keywords is mandatory"

        vqd = self._get_vqd(keywords)
-        if not vqd:
-            return None
+        assert vqd, "error in getting vqd"

        safesearch_base = {"on": 1, "moderate": 1, "off": -1}
        timelimit = f"time:{timelimit}" if timelimit else ""
@@ -237,26 +236,27 @@ def images(
                break

            page_data = resp_json.get("results", None)
-            result_exists = False
-            for row in page_data:
-                image_url = row.get("image", None)
-                if image_url and image_url not in cache:
-                    cache.add(image_url)
-                    result_exists = True
-                    yield {
-                        "title": row["title"],
-                        "image": image_url,
-                        "thumbnail": row["thumbnail"],
-                        "url": row["url"],
-                        "height": row["height"],
-                        "width": row["width"],
-                        "source": row["source"],
-                    }
-            next = resp_json.get("next", None)
-            if next:
-                payload["s"] = next.split("s=")[-1].split("&")[0]
-            if next is None or result_exists is False:
-                break
+            if page_data:
+                result_exists = False
+                for row in page_data:
+                    image_url = row.get("image", None)
+                    if image_url and image_url not in cache:
+                        cache.add(image_url)
+                        result_exists = True
+                        yield {
+                            "title": row["title"],
+                            "image": image_url,
+                            "thumbnail": row["thumbnail"],
+                            "url": row["url"],
+                            "height": row["height"],
+                            "width": row["width"],
+                            "source": row["source"],
+                        }
+                next = resp_json.get("next", None)
+                if next:
+                    payload["s"] = next.split("s=")[-1].split("&")[0]
+                if next is None or result_exists is False:
+                    break

    def videos(
        self,
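
images() drives pagination from the API's `next` field instead, deriving the `s` offset from it and stopping when `next` is absent or a page adds nothing new. For example, with the keys yielded in the hunk above:

```python3
from itertools import islice

from duckduckgo_search import DDGS

ddgs = DDGS()
for img in islice(ddgs.images("butterfly"), 10):
    print(img["title"], img["image"], f'{img["width"]}x{img["height"]}')
```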
@@ -286,8 +286,7 @@ def videos(
        assert keywords, "keywords is mandatory"

        vqd = self._get_vqd(keywords)
-        if not vqd:
-            return None
+        assert vqd, "error in getting vqd"

        safesearch_base = {"on": 1, "moderate": -1, "off": -2}
        timelimit = f"publishedAfter:{timelimit}" if timelimit else ""
@@ -350,8 +349,7 @@ def news(
        assert keywords, "keywords is mandatory"

        vqd = self._get_vqd(keywords)
-        if not vqd:
-            return None
+        assert vqd, "error in getting vqd"

        safesearch_base = {"on": 1, "moderate": -1, "off": -2}
        payload = {
@@ -539,8 +537,7 @@ def maps(
        assert keywords, "keywords is mandatory"

        vqd = self._get_vqd(keywords)
-        if not vqd:
-            return None
+        assert vqd, "error in getting vqd"

        # if longitude and latitude are specified, skip the request about bbox to the nominatim api
        if latitude and longitude:
@@ -623,36 +620,37 @@ def maps(
            except (AttributeError, JSONDecodeError):
                break

-            for res in page_data:
-                result = MapsResult()
-                result.title = res["name"]
-                result.address = res["address"]
-                if f"{result.title} {result.address}" in cache:
-                    continue
-                else:
-                    cache.add(f"{result.title} {result.address}")
-                result.country_code = res["country_code"]
-                result.url = res["website"]
-                result.phone = res["phone"]
-                result.latitude = res["coordinates"]["latitude"]
-                result.longitude = res["coordinates"]["longitude"]
-                result.source = unquote(res["url"])
-                if res["embed"]:
-                    result.image = res["embed"].get("image", "")
-                    result.links = res["embed"].get("third_party_links", "")
-                    result.desc = res["embed"].get("description", "")
-                result.hours = res["hours"]
-                yield result.__dict__
-
-            # divide the square into 4 parts and add to the queue
-            if len(page_data) >= 15:
-                lat_middle = (lat_t + lat_b) / 2
-                lon_middle = (lon_l + lon_r) / 2
-                bbox1 = (lat_t, lon_l, lat_middle, lon_middle)
-                bbox2 = (lat_t, lon_middle, lat_middle, lon_r)
-                bbox3 = (lat_middle, lon_l, lat_b, lon_middle)
-                bbox4 = (lat_middle, lon_middle, lat_b, lon_r)
-                work_bboxes.extendleft([bbox1, bbox2, bbox3, bbox4])
+            if page_data:
+                for res in page_data:
+                    result = MapsResult()
+                    result.title = res["name"]
+                    result.address = res["address"]
+                    if f"{result.title} {result.address}" in cache:
+                        continue
+                    else:
+                        cache.add(f"{result.title} {result.address}")
+                    result.country_code = res["country_code"]
+                    result.url = res["website"]
+                    result.phone = res["phone"]
+                    result.latitude = res["coordinates"]["latitude"]
+                    result.longitude = res["coordinates"]["longitude"]
+                    result.source = unquote(res["url"])
+                    if res["embed"]:
+                        result.image = res["embed"].get("image", "")
+                        result.links = res["embed"].get("third_party_links", "")
+                        result.desc = res["embed"].get("description", "")
+                    result.hours = res["hours"]
+                    yield result.__dict__
+
+                # divide the square into 4 parts and add to the queue
+                if len(page_data) >= 15:
+                    lat_middle = (lat_t + lat_b) / 2
+                    lon_middle = (lon_l + lon_r) / 2
+                    bbox1 = (lat_t, lon_l, lat_middle, lon_middle)
+                    bbox2 = (lat_t, lon_middle, lat_middle, lon_r)
+                    bbox3 = (lat_middle, lon_l, lat_b, lon_middle)
+                    bbox4 = (lat_middle, lon_middle, lat_b, lon_r)
+                    work_bboxes.extendleft([bbox1, bbox2, bbox3, bbox4])

    def translate(
        self,
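
The quadrant split at the end of the maps() hunk is what lets it exhaust dense areas: whenever a bounding box comes back with a full page (15+ results), it is cut into four sub-boxes that are pushed to the front of the work queue. A standalone sketch of that subdivision; the function wrapper and seed coordinates are illustrative, while the quadrant math mirrors the diff:

```python3
from collections import deque
from typing import Deque, List, Tuple

# (lat_top, lon_left, lat_bottom, lon_right)
BBox = Tuple[float, float, float, float]

def split_bbox(lat_t: float, lon_l: float, lat_b: float, lon_r: float) -> List[BBox]:
    """Cut a bounding box into its four quadrants, as maps() does."""
    lat_middle = (lat_t + lat_b) / 2
    lon_middle = (lon_l + lon_r) / 2
    return [
        (lat_t, lon_l, lat_middle, lon_middle),  # top-left
        (lat_t, lon_middle, lat_middle, lon_r),  # top-right
        (lat_middle, lon_l, lat_b, lon_middle),  # bottom-left
        (lat_middle, lon_middle, lat_b, lon_r),  # bottom-right
    ]

work_bboxes: Deque[BBox] = deque([(55.0, 37.0, 50.0, 42.0)])  # hypothetical seed box
bbox = work_bboxes.pop()
# ... search over bbox; if it returned a full page (>= 15 results):
work_bboxes.extendleft(split_bbox(*bbox))
```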
@@ -674,8 +672,7 @@ def translate(
        assert keywords, "keywords is mandatory"

        vqd = self._get_vqd("translate")
-        if not vqd:
-            return None
+        assert vqd, "error in getting vqd"

        payload = {
            "vqd": vqd,
2 changes: 1 addition & 1 deletion duckduckgo_search/version.py
@@ -1 +1 @@
__version__ = "3.1.1"
__version__ = "3.2.0"
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -29,7 +29,7 @@ classifiers = [
]
dependencies = [
    "click>=8.1.3",
-    "requests>=2.30.0",
+    "requests>=2.31.0",
]
dynamic = ["version"]

2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,2 +1,2 @@
click>=8.1.3
-requests>=2.30.0
+requests>=2.31.0
