v3.9.9 (#140)

1) text(backend="html"/"lite"): removed ad results, 2) README: added exceptions, removed unnecessary async examples.
deedy5 · Nov 28, 2023 · 2deb5bb · 2deb5bb
1 parent 8349f9d
commit 2deb5bb
Show file tree

Hide file tree

Showing 4 changed files with 45 additions and 119 deletions.
diff --git a/README.md b/README.md
@@ -10,6 +10,7 @@ Search for words, documents, images, videos, news, maps and text translation usi
 * [Regions](#regions)
 * [DDGS and AsyncDDGS classes](#ddgs-and-asyncddgs-classes)
 * [Using proxy](#using-proxy)
+* [Exceptions](#exceptions)
 * [1. text() - text search](#1-text---text-search-by-duckduckgocom)
 * [2. answers() - instant answers](#2-answers---instant-answers-by-duckduckgocom)
 * [3. images() - image search](#3-images---image-search-by-duckduckgocom)
@@ -196,7 +197,7 @@ async def main():
 if __name__ == "__main__":
     asyncio.run(main())
 ```
-It is important to note that the DDGS and AsyncDDGS classes should always be used as a context manager (with statement). 
+It is important to note that the DDGS and AsyncDDGS classes should always be used as a context manager (with statement).
 This ensures proper resource management and cleanup, as the context manager will automatically handle opening and closing the HTTP client connection.
 
 [Go To TOP](#TOP)
@@ -218,18 +219,18 @@ with DDGS(proxies="socks5://user:password@geo.iproyal.com:32325", timeout=20) as
     for r in ddgs.text("something you need", max_results=50):
         print(r)
 ```
-*3. Async*
-```python3
-import asyncio
-from duckduckgo_search import AsyncDDGS
 
-async def get_results():
-    async with AsyncDDGS(proxies="socks5://user:password@geo.iproyal.com:32325", timeout=20) as ddgs:
-        async for r in ddgs.text("cat", max_results=50):
-            print(r)
+[Go To TOP](#TOP)
 
-asyncio.run(get_results())
-```
+## Exceptions
+
+The following exceptions may be raised:
+- `APIException`: Raised when there is an issue with the API request.
+- `DuckDuckGoSearchException`: Raised when a generic error occurs during the API request.
+- `HTTPException`: Raised when there is an HTTP error during the API request.
+- `RateLimitException`: Raised when the API rate limit is exceeded.
+- `TimeoutException`: Raised when there is a timeout during the API request.
+- `VQDExtractionException`: Raised when there is an error extracting the VQD value for a search query.
 
 [Go To TOP](#TOP)
 
@@ -274,19 +275,6 @@ with DDGS() as ddgs:
     for r in ddgs.text('russia filetype:pdf', region='wt-wt', safesearch='off', timelimit='y', max_results=10):
         print(r)
 ```
-***Async***
-```python
-import asyncio
-from duckduckgo_search import AsyncDDGS
-
-async def get_results():
-    async with AsyncDDGS() as ddgs:
-        async for result in ddgs.text("cat", max_results=50):
-            print(result)
-
-asyncio.run(get_results())
-```
-
 
 [Go To TOP](#TOP)
 
@@ -312,18 +300,6 @@ with DDGS() as ddgs:
     for r in ddgs.answers("sun"):
         print(r)
 ```
-***Async***
-```python
-import asyncio
-from duckduckgo_search import AsyncDDGS
-
-async def get_results():
-    async with AsyncDDGS() as ddgs:
-        async for r in ddgs.answers("sun"):
-            print(r)
-
-asyncio.run(get_results())
-```
 
 [Go To TOP](#TOP)
 
@@ -386,18 +362,6 @@ with DDGS() as ddgs:
     for r in ddgs_images_gen:
         print(r)
 ```
-***Async***
-```python
-import asyncio
-from duckduckgo_search import AsyncDDGS
-
-async def get_results():
-    async with AsyncDDGS() as ddgs:
-        async for r in ddgs.images("butterfly", max_results=50):
-            print(r)
-
-asyncio.run(get_results())
-```
 
 [Go To TOP](#TOP)
 
@@ -449,19 +413,6 @@ with DDGS() as ddgs:
     for r in ddgs_videos_gen:
         print(r)
 ```
-***Async***
-```python
-import asyncio
-from duckduckgo_search import AsyncDDGS
-
-async def get_results():
-    async with AsyncDDGS() as ddgs:
-        async for r in ddgs.videos("tesla", max_results=50):
-            print(r)
-
-asyncio.run(get_results())
-```
-
 
 [Go To TOP](#TOP)
 
@@ -505,18 +456,6 @@ with DDGS() as ddgs:
     for r in ddgs_news_gen:
         print(r)
 ```
-***Async***
-```python
-import asyncio
-from duckduckgo_search import AsyncDDGS
-
-async def get_results():
-    async with AsyncDDGS() as ddgs:
-        async for r in ddgs.news("holiday", max_results=15):
-            print(r)
-
-asyncio.run(get_results())
-```
 
 [Go To TOP](#TOP)
 
@@ -567,18 +506,6 @@ with DDGS() as ddgs:
     for r in ddgs.maps("school", place="Uganda", max_results=50):
         print(r)
 ```
-***Async***
-```python
-import asyncio
-from duckduckgo_search import AsyncDDGS
-
-async def get_results():
-    async with AsyncDDGS() as ddgs:
-        async for r in ddgs.maps("school", place="Berlin", max_results=50):
-            print(r)
-
-asyncio.run(get_results())
-```
 
 [Go To TOP](#TOP)
 
@@ -611,22 +538,9 @@ with DDGS() as ddgs:
     r = ddgs.translate(keywords, to="de")
     print(r)
 ```
-***Async***
-```python
-import asyncio
-from duckduckgo_search import AsyncDDGS
-
-async def get_results():
-    async with AsyncDDGS() as ddgs:
-        r = await ddgs.translate("school", to="de"):
-        print(r)
-
-asyncio.run(get_results())
-```
 
 [Go To TOP](#TOP)
 
-
 ## 8. suggestions() - suggestions by duckduckgo.com
 
 ```python
@@ -652,17 +566,5 @@ with DDGS() as ddgs:
     for r in ddgs.suggestions("fly"):
         print(r)
 ```
-***Async***
-```python
-import asyncio
-from duckduckgo_search import AsyncDDGS
-
-async def get_results():
-    async with AsyncDDGS() as ddgs:
-        async for r in ddgs.suggestions("fly"):
-            print(r)
-
-asyncio.run(get_results())
-```
 
 [Go To TOP](#TOP)
diff --git a/duckduckgo_search/duckduckgo_search.py b/duckduckgo_search/duckduckgo_search.py
@@ -43,14 +43,16 @@ def _get_url(self, method: str, url: str, **kwargs) -> Optional[httpx._models.Re
         try:
             resp = self._client.request(method, url, follow_redirects=True, **kwargs)
             if _is_500_in_url(str(resp.url)) or resp.status_code == 403:
-                raise APIException(f"_get_url() {url} 500 in url")
+                raise APIException(f"_get_url() {url}")
             if resp.status_code == 202:
-                raise RateLimitException(f"_get_url() {url} RateLimitError: resp.status_code==202")
+                raise RateLimitException(f"_get_url() {url}")
             if resp.status_code == 200:
                 return resp
             resp.raise_for_status()
         except httpx.TimeoutException as ex:
             raise TimeoutException(f"_get_url() {url} TimeoutException: {ex}")
+        except (APIException, RateLimitException):
+            raise
         except httpx.HTTPError as ex:
             raise HTTPException(f"_get_url() {url} HttpError: {ex}")
         except Exception as ex:
@@ -227,7 +229,12 @@ def _text_html(
             for e in tree.xpath('//div[contains(@class, "results_links")]'):
                 href = e.xpath('.//a[contains(@class, "result__a")]/@href')
                 href = href[0] if href else None
-                if href and href not in cache and href != f"http://www.google.com/search?q={keywords}":
+                if (
+                    href
+                    and href not in cache
+                    and href != f"http://www.google.com/search?q={keywords}"
+                    and not href.startswith("https://duckduckgo.com/y.js?ad_domain")
+                ):
                     cache.add(href)
                     title = e.xpath('.//a[contains(@class, "result__a")]/text()')
                     body = e.xpath('.//a[contains(@class, "result__snippet")]//text()')
@@ -296,7 +303,12 @@ def _text_lite(
                 if i == 1:
                     href = e.xpath(".//a//@href")
                     href = href[0] if href else None
-                    if href is None or href in cache or href == f"http://www.google.com/search?q={keywords}":
+                    if (
+                        href is None
+                        or href in cache
+                        or href == f"http://www.google.com/search?q={keywords}"
+                        or href.startswith("https://duckduckgo.com/y.js?ad_domain")
+                    ):
                         [next(data, None) for _ in range(3)]  # skip block(i=1,2,3,4)
                     else:
                         cache.add(href)

diff --git a/duckduckgo_search/duckduckgo_search_async.py b/duckduckgo_search/duckduckgo_search_async.py
@@ -43,14 +43,16 @@ async def _get_url(self, method: str, url: str, **kwargs) -> Optional[httpx._mod
         try:
             resp = await self._client.request(method, url, follow_redirects=True, **kwargs)
             if _is_500_in_url(str(resp.url)) or resp.status_code == 403:
-                raise APIException(f"_get_url() {url} 500 in url")
+                raise APIException(f"_get_url() {url}")
             if resp.status_code == 202:
-                raise RateLimitException(f"_get_url() {url} RateLimitError: resp.status_code==202")
+                raise RateLimitException(f"_get_url() {url}")
             if resp.status_code == 200:
                 return resp
             resp.raise_for_status()
         except httpx.TimeoutException as ex:
             raise TimeoutException(f"_get_url() {url} TimeoutException: {ex}")
+        except (APIException, RateLimitException):
+            raise
         except httpx.HTTPError as ex:
             raise HTTPException(f"_get_url() {url} HttpError: {ex}")
         except Exception as ex:
@@ -228,7 +230,12 @@ async def _text_html(
             for e in tree.xpath('//div[contains(@class, "results_links")]'):
                 href = e.xpath('.//a[contains(@class, "result__a")]/@href')
                 href = href[0] if href else None
-                if href and href not in cache and href != f"http://www.google.com/search?q={keywords}":
+                if (
+                    href
+                    and href not in cache
+                    and href != f"http://www.google.com/search?q={keywords}"
+                    and not href.startswith("https://duckduckgo.com/y.js?ad_domain")
+                ):
                     cache.add(href)
                     title = e.xpath('.//a[contains(@class, "result__a")]/text()')
                     body = e.xpath('.//a[contains(@class, "result__snippet")]//text()')
@@ -297,7 +304,12 @@ async def _text_lite(
                 if i == 1:
                     href = e.xpath(".//a//@href")
                     href = href[0] if href else None
-                    if href is None or href in cache or href == f"http://www.google.com/search?q={keywords}":
+                    if (
+                        href is None
+                        or href in cache
+                        or href == f"http://www.google.com/search?q={keywords}"
+                        or href.startswith("https://duckduckgo.com/y.js?ad_domain")
+                    ):
                         [next(data, None) for _ in range(3)]  # skip block(i=1,2,3,4)
                     else:
                         cache.add(href)

diff --git a/duckduckgo_search/version.py b/duckduckgo_search/version.py
@@ -1 +1 @@
-__version__ = "3.9.8"
+__version__ = "3.9.9"