diff --git a/CHANGELOG.md b/CHANGELOG.md index d973032..f76452e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,6 @@ 2.0.0 - - __breaking change__: more consistent parameter names for the Stealth object + - __breaking change__: more consistent naming all around + - __breaking change__: Stealth object only takes kwargs now - new methods to hook context: Stealth.use_async and Stealth.use_sync - this allows us to patch CLI options as well, which are often better at faking than JS - chore: name options consistently @@ -7,4 +8,5 @@ - ft: use replaceProperty util which copies existing property descriptors 2b9b4b39 - fix: remove deprecated pkg_resources usage (to support Python 3.12) (#2) - fix: navigator_platform typo (#1) - - ft: better type hinted functions \ No newline at end of file + - ft: better type hinted functions + - ft: User-Agent and Sec-CH-UA spoofing \ No newline at end of file diff --git a/README.md b/README.md index a9d6f7b..ef1e19f 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ Transplanted from [puppeteer-extra-plugin-stealth](https://github.com/berstend/puppeteer-extra/tree/master/packages/puppeteer-extra-plugin-stealth), with some improvements. Don't expect this to bypass anything but the simplest of bot detection methods. Consider this a proof-of-concept starting point. -This is a WIP fork, with the goal of replacing the out-of-date upstream. +This is a WIP fork, with the goal of replacing the out-of-date upstream. See the [changelog](./CHANGELOG.md). ## Install diff --git a/playwright_stealth/case_insensitive_dict.py b/playwright_stealth/case_insensitive_dict.py new file mode 100644 index 0000000..3642e2b --- /dev/null +++ b/playwright_stealth/case_insensitive_dict.py @@ -0,0 +1,77 @@ +from collections.abc import MutableMapping + + +# straight from: https://github.com/kennethreitz/requests/blob/master/src/requests/structures.py +class CaseInsensitiveDict(MutableMapping): + """A case-insensitive ``dict``-like object.
+ + Implements all methods and operations of + ``MutableMapping`` as well as dict's ``copy``. Also + provides ``lower_items``. + + All keys are expected to be strings. The structure remembers the + case of the last key to be set, and ``iter(instance)``, + ``keys()``, ``items()``, ``iterkeys()``, and ``iteritems()`` + will contain case-sensitive keys. However, querying and contains + testing is case insensitive:: + + cid = CaseInsensitiveDict() + cid['Accept'] = 'application/json' + cid['aCCEPT'] == 'application/json' # True + list(cid) == ['Accept'] # True + + For example, ``headers['content-encoding']`` will return the + value of a ``'Content-Encoding'`` response header, regardless + of how the header name was originally stored. + + If the constructor, ``.update``, or equality comparison + operations are given keys that have equal ``.lower()``s, the + behavior is undefined. + """ + + def __init__(self, data=None, **kwargs): + self._store = {} + if data is None: + data = {} + self.update(data, **kwargs) + + def __setitem__(self, key, value): + # Use the lowercased key for lookups, but store the actual + # key alongside the value. 
+ self._store[key.lower()] = (key, value) + + def __getitem__(self, key): + return self._store[key.lower()][1] + + def __delitem__(self, key): + del self._store[key.lower()] + + def __iter__(self): + return (casedkey for casedkey, mappedvalue in self._store.values()) + + def __len__(self): + return len(self._store) + + def lower_items(self): + """Like iteritems(), but with all lowercase keys.""" + return ((lowerkey, keyval[1]) for (lowerkey, keyval) in self._store.items()) + + def __eq__(self, other): + from collections.abc import Mapping + + if isinstance(other, Mapping): + other = CaseInsensitiveDict(other) + else: + return NotImplemented + # Compare insensitively + return dict(self.lower_items()) == dict(other.lower_items()) + + # Copy is required + def copy(self): + return CaseInsensitiveDict(self._store.values()) + + def __repr__(self): + return str(dict(self.items())) + + def items(self): + return super().items() diff --git a/playwright_stealth/stealth.py b/playwright_stealth/stealth.py index 60f9940..1143ac8 100644 --- a/playwright_stealth/stealth.py +++ b/playwright_stealth/stealth.py @@ -1,15 +1,17 @@ # -*- coding: utf-8 -*- import inspect import json +import random import re import warnings from collections.abc import Callable from copy import deepcopy from pathlib import Path -from typing import Dict, List, Union, Any +from typing import Dict, List, Union, Any, Tuple, Optional from playwright import async_api, sync_api +from playwright_stealth.case_insensitive_dict import CaseInsensitiveDict from playwright_stealth.context_managers import ( AsyncWrappingContextManager, SyncWrappingContextManager, @@ -41,8 +43,6 @@ def from_file(name) -> str: "webgl_vendor": from_file("evasions/webgl.vendor.js"), } -from typing import Tuple, Optional - class Stealth: """ @@ -64,12 +64,12 @@ def enabled_scripts(): def __init__( self, - navigator_webdriver: bool = True, - webgl_vendor: bool = True, + *, chrome_app: bool = True, chrome_csi: bool = True, chrome_load_times: bool = True, 
chrome_runtime: bool = False, + hairline: bool = True, iframe_content_window: bool = True, media_codecs: bool = True, navigator_hardware_concurrency: bool = True, @@ -79,24 +79,26 @@ def __init__( navigator_plugins: bool = True, navigator_user_agent: bool = True, navigator_vendor: bool = True, - hairline: bool = True, - webgl_vendor_override: str = "Intel Inc.", - webgl_renderer_override: str = "Intel Iris OpenGL Engine", - navigator_vendor_override: str = "Google Inc.", - navigator_user_agent_override: Optional[str] = None, - navigator_platform_override: Optional[str] = None, - navigator_languages_override: Tuple[str, str] = ("en-US", "en"), + navigator_webdriver: bool = True, + sec_ch_ua: bool = True, + webgl_vendor: bool = True, chrome_runtime_run_on_insecure_origins: bool = False, + navigator_languages_override: Tuple[str, str] = ("en-US", "en"), + navigator_platform_override: Optional[str] = None, + navigator_user_agent_override: Optional[str] = None, + navigator_vendor_override: str = "Google Inc.", + sec_ch_ua_override: Optional[str] = None, + webgl_renderer_override: str = "Intel Iris OpenGL Engine", + webgl_vendor_override: str = "Intel Inc.", init_scripts_only: bool = False, script_logging: bool = False, ): # scripts to load - self.navigator_webdriver: bool = navigator_webdriver - self.webgl_vendor: bool = webgl_vendor self.chrome_app: bool = chrome_app self.chrome_csi: bool = chrome_csi self.chrome_load_times: bool = chrome_load_times self.chrome_runtime: bool = chrome_runtime + self.hairline: bool = hairline self.iframe_content_window: bool = iframe_content_window self.media_codecs: bool = media_codecs self.navigator_hardware_concurrency: int = navigator_hardware_concurrency @@ -106,16 +108,20 @@ def __init__( self.navigator_plugins: bool = navigator_plugins self.navigator_user_agent: bool = navigator_user_agent self.navigator_vendor: bool = navigator_vendor - self.hairline: bool = hairline + self.navigator_webdriver: bool = navigator_webdriver + self.sec_ch_ua: bool = sec_ch_ua + 
self.webgl_vendor: bool = webgl_vendor - # options - self.webgl_vendor_override: str = webgl_vendor_override - self.webgl_renderer_override: str = webgl_renderer_override - self.navigator_vendor_override: str = navigator_vendor_override - self.navigator_user_agent_override: Optional[str] = navigator_user_agent_override - self.navigator_platform_override: Optional[str] = navigator_platform_override - self.navigator_languages_override: Tuple[str, str] = navigator_languages_override + # evasion options self.chrome_runtime_run_on_insecure_origins: Optional[bool] = chrome_runtime_run_on_insecure_origins + self.navigator_languages_override: Tuple[str, str] = navigator_languages_override + self.navigator_platform_override: Optional[str] = navigator_platform_override + self.navigator_user_agent_override: Optional[str] = navigator_user_agent_override + self.navigator_vendor_override: str = navigator_vendor_override + self.sec_ch_ua_override: Optional[str] = sec_ch_ua_override + self.webgl_renderer_override: str = webgl_renderer_override + self.webgl_vendor_override: str = webgl_vendor_override + # other options self.init_scripts_only: bool = init_scripts_only self.script_logging = script_logging @@ -259,16 +264,7 @@ async def async_hooked_method(*args, **kwargs) -> Union[async_api.Browser, async *args, **self._kwargs_with_patched_cli_arg(method, kwargs, chromium_mode), ) - if isinstance(browser_or_context, async_api.BrowserContext): - context: async_api.BrowserContext = browser_or_context - context.new_page = self._generate_hooked_new_page(context.new_page) - elif isinstance(browser_or_context, async_api.Browser): - browser: async_api.Browser = browser_or_context - browser.new_page = self._generate_hooked_new_page(browser.new_page) - browser.new_context = self._generate_hooked_new_context(browser.new_context) - else: - raise TypeError(f"unexpected type from function (bug): {method.__name__} returned {browser_or_context}") - + 
self._reassign_new_page_new_context(browser_or_context) return browser_or_context def sync_hooked_method(*args, **kwargs) -> Union[sync_api.Browser, sync_api.BrowserContext]: @@ -276,16 +272,7 @@ def sync_hooked_method(*args, **kwargs) -> Union[sync_api.Browser, sync_api.Brow *args, **self._kwargs_with_patched_cli_arg(method, kwargs, chromium_mode), ) - if isinstance(browser_or_context, sync_api.BrowserContext): - context: sync_api.BrowserContext = browser_or_context - context.new_page = self._generate_hooked_new_page(context.new_page) - elif isinstance(browser_or_context, sync_api.Browser): - browser: sync_api.Browser = browser_or_context - browser.new_page = self._generate_hooked_new_page(browser.new_page) - browser.new_context = self._generate_hooked_new_context(browser.new_context) - else: - raise TypeError(f"unexpected type from function (bug): {method.__name__} returned {browser_or_context}") - + self._reassign_new_page_new_context(browser_or_context) return browser_or_context if inspect.iscoroutinefunction(method): @@ -309,25 +296,146 @@ def hooked_browser_method_sync(*args, **kwargs): def _generate_hooked_new_page(self, new_page_method: Callable) -> Callable: """ - Returns a hooked method (async or sync) for new_page or new_context. + Returns a hooked method (async or sync) for new_page. *args and **kwargs even though these methods may not take any number of arguments, we want to preserve accurate stack traces when caller passes args improperly """ + browser_instance = new_page_method.__self__ + USER_AGENT_OVERRIDE_PIGGYBACK_KEY = "_stealth_user_agent" + SEC_CH_UA_OVERRIDE_PIGGYBACK_KEY = "_stealth_sec_ch_ua" + + async def get_user_agent_and_sec_ch_ua_async(page_method: Callable) -> Tuple[str, str]: + """ + If there's no override, it's Chrome, and we haven't cached a UA value prior, we need to come up + with an accurate, non-headless UA ourselves. 
It's impossible to get UA without creating a temp page: + https://github.com/microsoft/playwright/issues/31743#issuecomment-2241550377 + We can piggyback on the browser object to cache the UA - this way we don't have to create a temp page on every call + + Returns: + user_agent, sec_ch_ua + """ + temp_page: Optional[async_api.Page] + stealth_user_agent = getattr(browser_instance, USER_AGENT_OVERRIDE_PIGGYBACK_KEY, None) + sec_ch_ua = getattr(browser_instance, SEC_CH_UA_OVERRIDE_PIGGYBACK_KEY, None) + if stealth_user_agent is None or sec_ch_ua is None: + temp_page = await page_method() + stealth_user_agent = (await temp_page.evaluate("navigator.userAgent")).replace( + "HeadlessChrome", "Chrome" + ) + await temp_page.close(reason="playwright_stealth internal temp utility page") + sec_ch_ua = self._get_greased_chrome_sec_ua_ch(stealth_user_agent) + setattr(browser_instance, SEC_CH_UA_OVERRIDE_PIGGYBACK_KEY, sec_ch_ua) + setattr(browser_instance, USER_AGENT_OVERRIDE_PIGGYBACK_KEY, stealth_user_agent) + return stealth_user_agent, sec_ch_ua + + def get_user_agent_and_sec_ch_ua_sync(page_method: Callable) -> Tuple[str, str]: + temp_page: Optional[sync_api.Page] + stealth_user_agent = getattr(browser_instance, USER_AGENT_OVERRIDE_PIGGYBACK_KEY, None) + sec_ch_ua = getattr(browser_instance, SEC_CH_UA_OVERRIDE_PIGGYBACK_KEY, None) + if stealth_user_agent is None or sec_ch_ua is None: + temp_page = page_method() + stealth_user_agent = temp_page.evaluate("navigator.userAgent").replace("HeadlessChrome", "Chrome") + sec_ch_ua = self._get_greased_chrome_sec_ua_ch(stealth_user_agent) + temp_page.close(reason="playwright_stealth internal temp utility page") + setattr(browser_instance, SEC_CH_UA_OVERRIDE_PIGGYBACK_KEY, sec_ch_ua) + setattr(browser_instance, USER_AGENT_OVERRIDE_PIGGYBACK_KEY, stealth_user_agent) + return stealth_user_agent, sec_ch_ua async def hooked_browser_method_async(*args, **kwargs): - page_or_context = await new_page_method(*args, **kwargs) - await self.apply_stealth_async(page_or_context) - return page_or_context + # 
respect any override the user passes themselves + if self.navigator_user_agent and kwargs.get("user_agent") is None: + user_agent_override = self.navigator_user_agent_override + if user_agent_override is None and browser_instance.browser_type.name == "chromium": + user_agent_override, _ = await get_user_agent_and_sec_ch_ua_async(new_page_method) + kwargs["user_agent"] = user_agent_override + + extra_http_headers = dict(kwargs.get("extra_http_headers") or {}) + # respect any override the user passes themselves + if self.sec_ch_ua and CaseInsensitiveDict(extra_http_headers).get("sec-ch-ua") is None: + sec_ch_ua_override = self.sec_ch_ua_override + if sec_ch_ua_override is None and browser_instance.browser_type.name == "chromium": + _, sec_ch_ua_override = await get_user_agent_and_sec_ch_ua_async(new_page_method) + if sec_ch_ua_override is not None: + # this could be tricky if a differently cased key of the same thing exists, + # but we have done a case-insensitive check above that precludes this + extra_http_headers["sec-ch-ua"] = sec_ch_ua_override + kwargs["extra_http_headers"] = extra_http_headers + page = await new_page_method(*args, **kwargs) + await self.apply_stealth_async(page) + return page def hooked_browser_method_sync(*args, **kwargs): - page_or_context = new_page_method(*args, **kwargs) - self.apply_stealth_sync(page_or_context) - return page_or_context + if self.navigator_user_agent and kwargs.get("user_agent") is None: + user_agent_override = self.navigator_user_agent_override + if user_agent_override is None and browser_instance.browser_type.name == "chromium": + user_agent_override, _ = get_user_agent_and_sec_ch_ua_sync(new_page_method) + kwargs["user_agent"] = user_agent_override + + extra_http_headers = dict(kwargs.get("extra_http_headers") or {}) + if self.sec_ch_ua and CaseInsensitiveDict(extra_http_headers).get("sec-ch-ua") is None: + sec_ch_ua_override = self.sec_ch_ua_override + # respect any override the user has already made + if 
sec_ch_ua_override is None and browser_instance.browser_type.name == "chromium": + _, sec_ch_ua_override = get_user_agent_and_sec_ch_ua_sync(new_page_method) + if sec_ch_ua_override is not None: + # this could be tricky if a differently cased key of the same thing exists, + # but we have done a case-insensitive check above that precludes this + extra_http_headers["sec-ch-ua"] = sec_ch_ua_override + kwargs["extra_http_headers"] = extra_http_headers + page = new_page_method(*args, **kwargs) + self.apply_stealth_sync(page) + return page if inspect.iscoroutinefunction(new_page_method): return hooked_browser_method_async return hooked_browser_method_sync + def _reassign_new_page_new_context( + self, + browser_or_context: Union[ + async_api.BrowserContext, async_api.Browser, sync_api.BrowserContext, sync_api.Browser + ], + ): + if isinstance(browser_or_context, (async_api.BrowserContext, sync_api.BrowserContext)): + context: async_api.BrowserContext = browser_or_context + context.new_page = self._generate_hooked_new_page(context.new_page) + elif isinstance(browser_or_context, (async_api.Browser, sync_api.Browser)): + browser: async_api.Browser = browser_or_context + browser.new_page = self._generate_hooked_new_page(browser.new_page) + browser.new_context = self._generate_hooked_new_context(browser.new_context) + else: + raise TypeError(f"unexpected type from function (bug): returned {browser_or_context}") + + @staticmethod + def _get_greased_chrome_sec_ua_ch(user_agent: str): + """ + From the major version in user_agent, generate a Sec-CH-UA header value. An example of the data in this + header can be generated from navigator.userAgentData.brands (requires secure context). We could query that + ourselves, but since it requires a secure context, there's no performant way to do that, so instead we + re-implement the greasing algorithm from Chrome. 
+ + See Also: + https://wicg.github.io/ua-client-hints/#grease + https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Sec-CH-UA + https://source.chromium.org/chromium/chromium/src/+/main:components/embedder_support/user_agent_utils.cc + Args: + user_agent: Chrome UA containing a "Chrome/<version>" token; only the major version is used + + Returns: + greased Sec-CH-UA header value + """ + greased_versions = [8, 99, 24] + greasy_chars = " ():-./;=?_" + greasy_brand = f"Not{random.choice(greasy_chars)}A{random.choice(greasy_chars)}Brand" + version = re.search(r"Chrome/(\d+)", user_agent, re.IGNORECASE) + major_version = version.group(1) + brands = [ + ("Chromium", major_version), + ("Chrome", major_version), + (greasy_brand, random.choice(greased_versions)), + ] + return ", ".join(f'"{brand}";v="{version}"' for brand, version in brands) + @staticmethod def _patch_blink_features_cli_args(existing_args: Optional[List[str]]) -> List[str]: """Patches CLI args list to disable AutomationControlled blink feature, while preserving other args""" @@ -372,12 +480,11 @@ def _patch_cli_arg(existing_args: List[str], flag: str) -> List[str]: ALL_EVASIONS_DISABLED_KWARGS = { - "navigator_webdriver": False, - "webgl_vendor": False, "chrome_app": False, "chrome_csi": False, "chrome_load_times": False, "chrome_runtime": False, + "hairline": False, "iframe_content_window": False, "media_codecs": False, "navigator_hardware_concurrency": False, @@ -387,5 +494,7 @@ def _patch_cli_arg(existing_args: List[str], flag: str) -> List[str]: "navigator_plugins": False, "navigator_user_agent": False, "navigator_vendor": False, - "hairline": False, + "navigator_webdriver": False, + "sec_ch_ua": False, + "webgl_vendor": False, }