V3.2.0 (#73)
1. _get_url() raises an exception after unsuccessful retries instead of returning None (see the sketch after this list),
2. assert vqd instead of returning None,
3. update requirements,
4. update README.
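
In practice this changes how callers handle rate limiting. A minimal sketch, assuming duckduckgo_search v3.2.0; the query string is chosen for illustration:

```python3
import requests

from duckduckgo_search import DDGS

ddgs = DDGS(timeout=10)
try:
    # text() is a generator; requests are issued lazily during iteration
    for result in ddgs.text("python programming"):
        print(result["href"])
except requests.exceptions.HTTPError as ex:
    # raised after unsuccessful retries, e.g.
    # "418 Client Error: for url: https://duckduckgo.com/" when rate-limited
    print(f"request failed after retries: {ex}")
except AssertionError:
    # raised when the vqd token could not be obtained (change 2 above)
    print("error in getting vqd")
```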
deedy5 authored May 23, 2023
1 parent f241773 commit cd0b070
Showing 5 changed files with 92 additions and 93 deletions.
6 changes: 4 additions & 2 deletions README.md
@@ -139,8 +139,10 @@ ___
[Go To TOP](#TOP)

## Using proxy
-The site blocks ip for a few minutes if you send too many requests. In this case, you need to use a proxy.
-Also you can set a timeout if the proxy takes a long time to respond (default timeout=10).
+If you send too many requests, the site blocks your IP for up to one minute and DDGS raises an exception:
+`requests.exceptions.HTTPError: 418 Client Error: for url: https://duckduckgo.com/`.
+In this case, retry after a while or use a proxy.
+You can set a timeout if the proxy takes a long time to respond (default timeout=10).

*1. The easiest way. Launch the Tor Browser*
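The body of this example is truncated in the diff view. A minimal sketch of what it would look like, assuming the Tor Browser is running with its default SOCKS5 port 9150 (requests needs the PySocks extra, `pip install "requests[socks]"`, to speak SOCKS):

```python3
from duckduckgo_search import DDGS

# Assumption: Tor Browser listens on localhost:9150;
# the socks5h scheme also resolves DNS through the proxy.
proxies = {
    "http": "socks5h://localhost:9150",
    "https": "socks5h://localhost:9150",
}
ddgs = DDGS(proxies=proxies, timeout=20)
for result in ddgs.text("tor project"):
    print(result["title"])
```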
173 changes: 85 additions & 88 deletions duckduckgo_search/duckduckgo_search.py
@@ -48,7 +48,6 @@ def __init__(
        proxies: Optional[Dict[str, str]] = None,
        timeout: int = 10,
    ) -> None:
-        self.headers = headers if headers else HEADERS
        self._session = requests.Session()
        self._session.headers.update(headers if headers else HEADERS)
        self._session.proxies.update(proxies if proxies else {})
@@ -64,10 +63,11 @@ def _get_url(self, method: str, url: str, **kwargs) -> Optional[Response]:
                    raise requests.HTTPError
                resp.raise_for_status()
                return resp
-            except (HTTPError, Timeout) as ex:
+            except Exception as ex:
                logger.warning(f"_get_url() {url} {type(ex).__name__} {ex}")
-                if i < 2:
-                    sleep(2**i)
+                if i >= 2 or "418" in str(ex):
+                    raise ex
+            sleep(2**i)
        return None

    def _get_vqd(self, keywords: str) -> Optional[str]:
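
Pieced together, the new retry loop reads roughly as follows. This is a sketch, not the verbatim source: the `for` loop and the request call sit outside the hunk and are reconstructed from context:

```python3
# Up to three attempts with exponential backoff; a 418 (rate limit) or the
# final failure is re-raised instead of being swallowed and returning None.
for i in range(3):
    try:
        resp = self._session.request(method, url, **kwargs)
        resp.raise_for_status()
        return resp
    except Exception as ex:
        logger.warning(f"_get_url() {url} {type(ex).__name__} {ex}")
        if i >= 2 or "418" in str(ex):
            raise ex
    sleep(2**i)  # wait 1s, then 2s, before the next attempt
return None  # effectively unreachable: the last attempt either returns or raises
```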
@@ -119,8 +119,7 @@ def text(
        assert keywords, "keywords is mandatory"

        vqd = self._get_vqd(keywords)
-        if not vqd:
-            return None
+        assert vqd, "error in getting vqd"

        safesearch_base = {"on": 1, "moderate": -1, "off": -2}
        payload = {
@@ -145,29 +144,30 @@ def text(
            except (AttributeError, JSONDecodeError):
                break

-            result_exists = False
-            for row in page_data:
-                if "n" in row:
-                    payload["s"] = row["n"].split("s=")[-1].split("&")[0]  # next page
-                href = row.get("u", None)
-                if (
-                    href
-                    and href not in cache
-                    and href != f"http://www.google.com/search?q={keywords}"
-                ):
-                    cache.add(href)
-                    body = self._normalize(row["a"])
-                    if body:
-                        result_exists = True
-                        yield {
-                            "title": self._normalize(row["t"]),
-                            "href": href,
-                            "body": body,
-                        }
-                elif result_exists is False:
-                    break
-            if result_exists is False:
-                break
+            if page_data:
+                result_exists = False
+                for row in page_data:
+                    if "n" in row:
+                        payload["s"] = row["n"].split("s=")[-1].split("&")[0]
+                    href = row.get("u", None)
+                    if (
+                        href
+                        and href not in cache
+                        and href != f"http://www.google.com/search?q={keywords}"
+                    ):
+                        cache.add(href)
+                        body = self._normalize(row["a"])
+                        if body:
+                            result_exists = True
+                            yield {
+                                "title": self._normalize(row["t"]),
+                                "href": href,
+                                "body": body,
+                            }
+                    elif result_exists is False:
+                        break
+                if result_exists is False:
+                    break

    def images(
        self,
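
The effect of the new `if page_data:` guard in text() is that pagination stops cleanly on an empty page, while `payload["s"]` carries the offset to the next one. A usage sketch (the `islice` cap is only for illustration):

```python3
from itertools import islice

from duckduckgo_search import DDGS

ddgs = DDGS()
# text() fetches further result pages lazily as you keep iterating
for r in islice(ddgs.text("duckduckgo"), 25):
    print(r["title"], r["href"])
```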
@@ -206,8 +206,7 @@ def images(
        assert keywords, "keywords is mandatory"

        vqd = self._get_vqd(keywords)
-        if not vqd:
-            return None
+        assert vqd, "error in getting vqd"

        safesearch_base = {"on": 1, "moderate": 1, "off": -1}
        timelimit = f"time:{timelimit}" if timelimit else ""
@@ -237,26 +236,27 @@ def images(
                break

            page_data = resp_json.get("results", None)
-            result_exists = False
-            for row in page_data:
-                image_url = row.get("image", None)
-                if image_url and image_url not in cache:
-                    cache.add(image_url)
-                    result_exists = True
-                    yield {
-                        "title": row["title"],
-                        "image": image_url,
-                        "thumbnail": row["thumbnail"],
-                        "url": row["url"],
-                        "height": row["height"],
-                        "width": row["width"],
-                        "source": row["source"],
-                    }
-            next = resp_json.get("next", None)
-            if next:
-                payload["s"] = next.split("s=")[-1].split("&")[0]
-            if next is None or result_exists is False:
-                break
+            if page_data:
+                result_exists = False
+                for row in page_data:
+                    image_url = row.get("image", None)
+                    if image_url and image_url not in cache:
+                        cache.add(image_url)
+                        result_exists = True
+                        yield {
+                            "title": row["title"],
+                            "image": image_url,
+                            "thumbnail": row["thumbnail"],
+                            "url": row["url"],
+                            "height": row["height"],
+                            "width": row["width"],
+                            "source": row["source"],
+                        }
+                next = resp_json.get("next", None)
+                if next:
+                    payload["s"] = next.split("s=")[-1].split("&")[0]
+                if next is None or result_exists is False:
+                    break

    def videos(
        self,
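
images() drives pagination from the API's `next` field instead, deriving the `s` offset from it and stopping when `next` is absent or a page adds nothing new. For example, with the keys yielded in the hunk above:

```python3
from itertools import islice

from duckduckgo_search import DDGS

ddgs = DDGS()
for img in islice(ddgs.images("butterfly"), 10):
    print(img["title"], img["image"], f'{img["width"]}x{img["height"]}')
```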
@@ -286,8 +286,7 @@ def videos(
        assert keywords, "keywords is mandatory"

        vqd = self._get_vqd(keywords)
-        if not vqd:
-            return None
+        assert vqd, "error in getting vqd"

        safesearch_base = {"on": 1, "moderate": -1, "off": -2}
        timelimit = f"publishedAfter:{timelimit}" if timelimit else ""
@@ -350,8 +349,7 @@ def news(
        assert keywords, "keywords is mandatory"

        vqd = self._get_vqd(keywords)
-        if not vqd:
-            return None
+        assert vqd, "error in getting vqd"

        safesearch_base = {"on": 1, "moderate": -1, "off": -2}
        payload = {
@@ -539,8 +537,7 @@ def maps(
        assert keywords, "keywords is mandatory"

        vqd = self._get_vqd(keywords)
-        if not vqd:
-            return None
+        assert vqd, "error in getting vqd"

        # if longitude and latitude are specified, skip the request about bbox to the nominatim api
        if latitude and longitude:
@@ -623,36 +620,37 @@ def maps(
            except (AttributeError, JSONDecodeError):
                break

-            for res in page_data:
-                result = MapsResult()
-                result.title = res["name"]
-                result.address = res["address"]
-                if f"{result.title} {result.address}" in cache:
-                    continue
-                else:
-                    cache.add(f"{result.title} {result.address}")
-                result.country_code = res["country_code"]
-                result.url = res["website"]
-                result.phone = res["phone"]
-                result.latitude = res["coordinates"]["latitude"]
-                result.longitude = res["coordinates"]["longitude"]
-                result.source = unquote(res["url"])
-                if res["embed"]:
-                    result.image = res["embed"].get("image", "")
-                    result.links = res["embed"].get("third_party_links", "")
-                    result.desc = res["embed"].get("description", "")
-                result.hours = res["hours"]
-                yield result.__dict__
-
-            # divide the square into 4 parts and add to the queue
-            if len(page_data) >= 15:
-                lat_middle = (lat_t + lat_b) / 2
-                lon_middle = (lon_l + lon_r) / 2
-                bbox1 = (lat_t, lon_l, lat_middle, lon_middle)
-                bbox2 = (lat_t, lon_middle, lat_middle, lon_r)
-                bbox3 = (lat_middle, lon_l, lat_b, lon_middle)
-                bbox4 = (lat_middle, lon_middle, lat_b, lon_r)
-                work_bboxes.extendleft([bbox1, bbox2, bbox3, bbox4])
+            if page_data:
+                for res in page_data:
+                    result = MapsResult()
+                    result.title = res["name"]
+                    result.address = res["address"]
+                    if f"{result.title} {result.address}" in cache:
+                        continue
+                    else:
+                        cache.add(f"{result.title} {result.address}")
+                    result.country_code = res["country_code"]
+                    result.url = res["website"]
+                    result.phone = res["phone"]
+                    result.latitude = res["coordinates"]["latitude"]
+                    result.longitude = res["coordinates"]["longitude"]
+                    result.source = unquote(res["url"])
+                    if res["embed"]:
+                        result.image = res["embed"].get("image", "")
+                        result.links = res["embed"].get("third_party_links", "")
+                        result.desc = res["embed"].get("description", "")
+                    result.hours = res["hours"]
+                    yield result.__dict__
+
+                # divide the square into 4 parts and add to the queue
+                if len(page_data) >= 15:
+                    lat_middle = (lat_t + lat_b) / 2
+                    lon_middle = (lon_l + lon_r) / 2
+                    bbox1 = (lat_t, lon_l, lat_middle, lon_middle)
+                    bbox2 = (lat_t, lon_middle, lat_middle, lon_r)
+                    bbox3 = (lat_middle, lon_l, lat_b, lon_middle)
+                    bbox4 = (lat_middle, lon_middle, lat_b, lon_r)
+                    work_bboxes.extendleft([bbox1, bbox2, bbox3, bbox4])

    def translate(
        self,
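
The quadrant split at the end of the maps() hunk is what lets it exhaust dense areas: whenever a bounding box comes back with a full page (15+ results), it is cut into four sub-boxes that are pushed to the front of the work queue. A standalone sketch of that subdivision; the function wrapper and seed coordinates are illustrative, while the quadrant math mirrors the diff:

```python3
from collections import deque
from typing import Deque, List, Tuple

# (lat_top, lon_left, lat_bottom, lon_right)
BBox = Tuple[float, float, float, float]

def split_bbox(lat_t: float, lon_l: float, lat_b: float, lon_r: float) -> List[BBox]:
    """Cut a bounding box into its four quadrants, as maps() does."""
    lat_middle = (lat_t + lat_b) / 2
    lon_middle = (lon_l + lon_r) / 2
    return [
        (lat_t, lon_l, lat_middle, lon_middle),  # top-left
        (lat_t, lon_middle, lat_middle, lon_r),  # top-right
        (lat_middle, lon_l, lat_b, lon_middle),  # bottom-left
        (lat_middle, lon_middle, lat_b, lon_r),  # bottom-right
    ]

work_bboxes: Deque[BBox] = deque([(55.0, 37.0, 50.0, 42.0)])  # hypothetical seed box
bbox = work_bboxes.pop()
# ... search over bbox; if it returned a full page (>= 15 results):
work_bboxes.extendleft(split_bbox(*bbox))
```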
@@ -674,8 +672,7 @@ def translate(
        assert keywords, "keywords is mandatory"

        vqd = self._get_vqd("translate")
-        if not vqd:
-            return None
+        assert vqd, "error in getting vqd"

        payload = {
            "vqd": vqd,
2 changes: 1 addition & 1 deletion duckduckgo_search/version.py
@@ -1 +1 @@
__version__ = "3.1.1"
__version__ = "3.2.0"
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -29,7 +29,7 @@ classifiers = [
]
dependencies = [
    "click>=8.1.3",
-    "requests>=2.30.0",
+    "requests>=2.31.0",
]
dynamic = ["version"]

2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,2 +1,2 @@
click>=8.1.3
-requests>=2.30.0
+requests>=2.31.0
