diff --git a/README.md b/README.md
index 75fb798..348efe9 100755
--- a/README.md
+++ b/README.md
@@ -139,8 +139,10 @@ ___
 [Go To TOP](#TOP)

 ## Using proxy
-The site blocks ip for a few minutes if you send too many requests. In this case, you need to use a proxy.
-Also you can set a timeout if the proxy takes a long time to respond (default timeout=10).
+If you send too many requests, the site blocks your IP for up to one minute and DDGS raises an exception
+`requests.exceptions.HTTPError: 418 Client Error: for url: https://duckduckgo.com/`.
+In this case, retry after a while or use a proxy.
+You can also set a timeout if the proxy takes a long time to respond (default timeout=10).

 *1. The easiest way. Launch the Tor Browser*
 ```python3
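Note: a minimal caller-side sketch of the behavior described above. The query, the Tor proxy URL, and the 60-second wait are illustrative; `DDGS` and its `proxies`/`timeout` arguments come from the `__init__` signature patched below.

```python3
from time import sleep

import requests

from duckduckgo_search import DDGS

# proxies is optional; the socks5 scheme requires `pip install requests[socks]`
proxies = {
    "http": "socks5://localhost:9150",
    "https": "socks5://localhost:9150",
}
ddgs = DDGS(proxies=proxies, timeout=20)
try:
    for result in ddgs.text("duckduckgo api"):
        print(result["title"], result["href"])
except requests.exceptions.HTTPError as ex:
    # a 418 response now propagates to the caller instead of being retried,
    # so wait out the block (up to a minute) before sending more requests
    print(f"blocked: {ex}")
    sleep(60)
```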
diff --git a/duckduckgo_search/duckduckgo_search.py b/duckduckgo_search/duckduckgo_search.py
index c776860..c0b8238 100644
--- a/duckduckgo_search/duckduckgo_search.py
+++ b/duckduckgo_search/duckduckgo_search.py
@@ -48,7 +48,6 @@ def __init__(
         proxies: Optional[Dict[str, str]] = None,
         timeout: int = 10,
     ) -> None:
-        self.headers = headers if headers else HEADERS
         self._session = requests.Session()
         self._session.headers.update(headers if headers else HEADERS)
         self._session.proxies.update(proxies if proxies else {})
@@ -64,10 +63,11 @@ def _get_url(self, method: str, url: str, **kwargs) -> Optional[Response]:
                 raise requests.HTTPError
             resp.raise_for_status()
             return resp
-        except (HTTPError, Timeout) as ex:
+        except Exception as ex:
             logger.warning(f"_get_url() {url} {type(ex).__name__} {ex}")
-            if i < 2:
-                sleep(2**i)
+            if i >= 2 or "418" in str(ex):
+                raise ex
+            sleep(2**i)
        return None

    def _get_vqd(self, keywords: str) -> Optional[str]:
@@ -119,8 +119,7 @@ def text(
         assert keywords, "keywords is mandatory"

         vqd = self._get_vqd(keywords)
-        if not vqd:
-            return None
+        assert vqd, "error in getting vqd"

         safesearch_base = {"on": 1, "moderate": -1, "off": -2}
         payload = {
@@ -145,29 +144,30 @@
             except (AttributeError, JSONDecodeError):
                 break

-            result_exists = False
-            for row in page_data:
-                if "n" in row:
-                    payload["s"] = row["n"].split("s=")[-1].split("&")[0]  # next page
-                href = row.get("u", None)
-                if (
-                    href
-                    and href not in cache
-                    and href != f"http://www.google.com/search?q={keywords}"
-                ):
-                    cache.add(href)
-                    body = self._normalize(row["a"])
-                    if body:
-                        result_exists = True
-                        yield {
-                            "title": self._normalize(row["t"]),
-                            "href": href,
-                            "body": body,
-                        }
-                elif result_exists is False:
+            if page_data:
+                result_exists = False
+                for row in page_data:
+                    if "n" in row:
+                        payload["s"] = row["n"].split("s=")[-1].split("&")[0]
+                    href = row.get("u", None)
+                    if (
+                        href
+                        and href not in cache
+                        and href != f"http://www.google.com/search?q={keywords}"
+                    ):
+                        cache.add(href)
+                        body = self._normalize(row["a"])
+                        if body:
+                            result_exists = True
+                            yield {
+                                "title": self._normalize(row["t"]),
+                                "href": href,
+                                "body": body,
+                            }
+                    elif result_exists is False:
+                        break
+                if result_exists is False:
                     break
-            if result_exists is False:
-                break

     def images(
         self,
@@ -206,8 +206,7 @@
         assert keywords, "keywords is mandatory"

         vqd = self._get_vqd(keywords)
-        if not vqd:
-            return None
+        assert vqd, "error in getting vqd"

         safesearch_base = {"on": 1, "moderate": 1, "off": -1}
         timelimit = f"time:{timelimit}" if timelimit else ""
@@ -237,26 +236,27 @@
                 break
             page_data = resp_json.get("results", None)

-            result_exists = False
-            for row in page_data:
-                image_url = row.get("image", None)
-                if image_url and image_url not in cache:
-                    cache.add(image_url)
-                    result_exists = True
-                    yield {
-                        "title": row["title"],
-                        "image": image_url,
-                        "thumbnail": row["thumbnail"],
-                        "url": row["url"],
-                        "height": row["height"],
-                        "width": row["width"],
-                        "source": row["source"],
-                    }
-            next = resp_json.get("next", None)
-            if next:
-                payload["s"] = next.split("s=")[-1].split("&")[0]
-            if next is None or result_exists is False:
-                break
+            if page_data:
+                result_exists = False
+                for row in page_data:
+                    image_url = row.get("image", None)
+                    if image_url and image_url not in cache:
+                        cache.add(image_url)
+                        result_exists = True
+                        yield {
+                            "title": row["title"],
+                            "image": image_url,
+                            "thumbnail": row["thumbnail"],
+                            "url": row["url"],
+                            "height": row["height"],
+                            "width": row["width"],
+                            "source": row["source"],
+                        }
+                next = resp_json.get("next", None)
+                if next:
+                    payload["s"] = next.split("s=")[-1].split("&")[0]
+                if next is None or result_exists is False:
+                    break

     def videos(
         self,
@@ -286,8 +286,7 @@
         assert keywords, "keywords is mandatory"

         vqd = self._get_vqd(keywords)
-        if not vqd:
-            return None
+        assert vqd, "error in getting vqd"

         safesearch_base = {"on": 1, "moderate": -1, "off": -2}
         timelimit = f"publishedAfter:{timelimit}" if timelimit else ""
@@ -350,8 +349,7 @@
         assert keywords, "keywords is mandatory"

         vqd = self._get_vqd(keywords)
-        if not vqd:
-            return None
+        assert vqd, "error in getting vqd"

         safesearch_base = {"on": 1, "moderate": -1, "off": -2}
         payload = {
@@ -539,8 +537,7 @@
         assert keywords, "keywords is mandatory"

         vqd = self._get_vqd(keywords)
-        if not vqd:
-            return None
+        assert vqd, "error in getting vqd"

         # if longitude and latitude are specified, skip the request about bbox to the nominatim api
         if latitude and longitude:
@@ -623,36 +620,37 @@
             except (AttributeError, JSONDecodeError):
                 break

-            for res in page_data:
-                result = MapsResult()
-                result.title = res["name"]
-                result.address = res["address"]
-                if f"{result.title} {result.address}" in cache:
-                    continue
-                else:
-                    cache.add(f"{result.title} {result.address}")
-                result.country_code = res["country_code"]
-                result.url = res["website"]
-                result.phone = res["phone"]
-                result.latitude = res["coordinates"]["latitude"]
-                result.longitude = res["coordinates"]["longitude"]
-                result.source = unquote(res["url"])
-                if res["embed"]:
-                    result.image = res["embed"].get("image", "")
-                    result.links = res["embed"].get("third_party_links", "")
-                    result.desc = res["embed"].get("description", "")
-                result.hours = res["hours"]
-                yield result.__dict__
-
-            # divide the square into 4 parts and add to the queue
-            if len(page_data) >= 15:
-                lat_middle = (lat_t + lat_b) / 2
-                lon_middle = (lon_l + lon_r) / 2
-                bbox1 = (lat_t, lon_l, lat_middle, lon_middle)
-                bbox2 = (lat_t, lon_middle, lat_middle, lon_r)
-                bbox3 = (lat_middle, lon_l, lat_b, lon_middle)
-                bbox4 = (lat_middle, lon_middle, lat_b, lon_r)
-                work_bboxes.extendleft([bbox1, bbox2, bbox3, bbox4])
+            if page_data:
+                for res in page_data:
+                    result = MapsResult()
+                    result.title = res["name"]
+                    result.address = res["address"]
+                    if f"{result.title} {result.address}" in cache:
+                        continue
+                    else:
+                        cache.add(f"{result.title} {result.address}")
+                    result.country_code = res["country_code"]
+                    result.url = res["website"]
+                    result.phone = res["phone"]
+                    result.latitude = res["coordinates"]["latitude"]
+                    result.longitude = res["coordinates"]["longitude"]
+                    result.source = unquote(res["url"])
+                    if res["embed"]:
+                        result.image = res["embed"].get("image", "")
+                        result.links = res["embed"].get("third_party_links", "")
+                        result.desc = res["embed"].get("description", "")
+                    result.hours = res["hours"]
+                    yield result.__dict__

+                # divide the square into 4 parts and add to the queue
+                if len(page_data) >= 15:
+                    lat_middle = (lat_t + lat_b) / 2
+                    lon_middle = (lon_l + lon_r) / 2
+                    bbox1 = (lat_t, lon_l, lat_middle, lon_middle)
+                    bbox2 = (lat_t, lon_middle, lat_middle, lon_r)
+                    bbox3 = (lat_middle, lon_l, lat_b, lon_middle)
+                    bbox4 = (lat_middle, lon_middle, lat_b, lon_r)
+                    work_bboxes.extendleft([bbox1, bbox2, bbox3, bbox4])

     def translate(
         self,
@@ -674,8 +672,7 @@
         assert keywords, "keywords is mandatory"

         vqd = self._get_vqd("translate")
-        if not vqd:
-            return None
+        assert vqd, "error in getting vqd"

         payload = {
             "vqd": vqd,
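Note: the `_get_url()` hunk above replaces the narrow `(HTTPError, Timeout)` handler with a broader policy: every exception is logged, a 418 block or a failure on the last attempt is re-raised to the caller, and anything else triggers an exponential backoff. A standalone sketch of that policy, assuming the enclosing loop is `for i in range(3)` (which the `i >= 2` check implies) and using `do_request` as a hypothetical stand-in for the session call:

```python3
from time import sleep

def fetch_with_retries(do_request):
    for i in range(3):
        try:
            return do_request()
        except Exception as ex:
            # re-raise immediately on a 418 block, or once attempts are exhausted
            if i >= 2 or "418" in str(ex):
                raise ex
            sleep(2**i)  # backoff: 1s after the first failure, 2s after the second
    return None
```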
"") + result.desc = res["embed"].get("description", "") + result.hours = res["hours"] + yield result.__dict__ + + # divide the square into 4 parts and add to the queue + if len(page_data) >= 15: + lat_middle = (lat_t + lat_b) / 2 + lon_middle = (lon_l + lon_r) / 2 + bbox1 = (lat_t, lon_l, lat_middle, lon_middle) + bbox2 = (lat_t, lon_middle, lat_middle, lon_r) + bbox3 = (lat_middle, lon_l, lat_b, lon_middle) + bbox4 = (lat_middle, lon_middle, lat_b, lon_r) + work_bboxes.extendleft([bbox1, bbox2, bbox3, bbox4]) def translate( self, @@ -674,8 +672,7 @@ def translate( assert keywords, "keywords is mandatory" vqd = self._get_vqd("translate") - if not vqd: - return None + assert vqd, "error in getting vqd" payload = { "vqd": vqd, diff --git a/duckduckgo_search/version.py b/duckduckgo_search/version.py index d539d50..1173108 100755 --- a/duckduckgo_search/version.py +++ b/duckduckgo_search/version.py @@ -1 +1 @@ -__version__ = "3.1.1" +__version__ = "3.2.0" diff --git a/pyproject.toml b/pyproject.toml index 87a7be5..cdf6f20 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,7 +29,7 @@ classifiers = [ ] dependencies = [ "click>=8.1.3", - "requests>=2.30.0", + "requests>=2.31.0", ] dynamic = ["version"] diff --git a/requirements.txt b/requirements.txt index 5453167..d073b4e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,2 @@ click>=8.1.3 -requests>=2.30.0 +requests>=2.31.0