Skip to content

Commit

Permalink
Fixed it -.-
Browse files Browse the repository at this point in the history
  • Loading branch information
wfxey committed Dec 7, 2024
1 parent d60bc88 commit 025ae9a
Show file tree
Hide file tree
Showing 2 changed files with 4 additions and 39 deletions.
32 changes: 2 additions & 30 deletions app/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,32 +8,19 @@
from .database import save_webpage, save_resource
from .utils import sanitize_domain_or_url as sanitize_domain

ALLOWED_DOMAINS = ["example.com", "another-allowed-domain.com"]

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

BASE_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), '../web_archives'))

AUTHORIZED_DOMAINS = ["example.com", "anotherdomain.com"]

def fetch_and_store_page(domain, url, base_url, visited_urls):
if url in visited_urls:
logger.debug(f"Skipping already visited URL: {url}")
return

visited_urls.add(url)

parsed_url = urlparse(url)
if parsed_url.netloc not in AUTHORIZED_DOMAINS:
logger.error(f"Unauthorized domain: {parsed_url.netloc}")
return

try:
if not is_url_allowed(url):
logger.error(f"URL not allowed: {url}")
return

response = requests.get(url, timeout=10)
if response.status_code != 200:
logger.error(f"Failed to fetch page: {url} - Status {response.status_code}")
Expand All @@ -43,9 +30,7 @@ def fetch_and_store_page(domain, url, base_url, visited_urls):
timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
soup = BeautifulSoup(response.text, 'html.parser')

resource_folder = os.path.normpath(os.path.join(BASE_DIR, sanitize_domain(domain), page_uuid))
if not resource_folder.startswith(BASE_DIR):
raise Exception("Invalid resource folder path")
resource_folder = os.path.join(BASE_DIR, sanitize_domain(domain), page_uuid)
os.makedirs(resource_folder, exist_ok=True)

for tag in soup.find_all(['link', 'script', 'img']):
Expand Down Expand Up @@ -75,18 +60,10 @@ def process_resource(tag, base_url, resource_folder, domain, page_uuid, timestam
full_resource_url = urljoin(base_url, resource_url)

try:
if not is_url_allowed(full_resource_url):
logger.error(f"Resource URL not allowed: {full_resource_url}")
return

res = requests.get(full_resource_url, timeout=10)
if res.status_code == 200:
resource_filename = generate_unique_filename(full_resource_url)
resource_path = os.path.normpath(os.path.join(resource_folder, resource_filename))

if not resource_path.startswith(resource_folder):
logger.error(f"Invalid resource path: {resource_path}")
return
resource_path = os.path.join(resource_folder, resource_filename)

with open(resource_path, 'wb') as file:
file.write(res.content)
Expand All @@ -104,11 +81,6 @@ def process_resource(tag, base_url, resource_folder, domain, page_uuid, timestam
except requests.RequestException as e:
logger.error(f"Failed to fetch resource: {full_resource_url} - {str(e)}")

def is_url_allowed(url, allowed_domains=None):
    """Return True if *url*'s host is within the allowed domains.

    A host is allowed when it equals an allowed domain exactly, or is a
    subdomain of one (e.g. ``sub.example.com`` matches ``example.com``).
    A plain suffix test is deliberately NOT used: it would let
    ``evilexample.com`` pass the check for ``example.com``.

    Args:
        url: Absolute URL to check.
        allowed_domains: Optional iterable of allowed domain names.
            Defaults to the module-level ``ALLOWED_DOMAINS`` list, so
            existing callers are unaffected.

    Returns:
        bool: True when the URL's host is allowed, else False.
    """
    domains = ALLOWED_DOMAINS if allowed_domains is None else allowed_domains
    # hostname (unlike netloc) is lower-cased and has any :port stripped,
    # so "https://Example.COM:8080/x" is matched correctly.
    host = urlparse(url).hostname or ""
    return any(
        host == domain or host.endswith("." + domain)
        for domain in (d.lower() for d in domains)
    )

def generate_unique_filename(resource_url):
"""Generate a unique filename for the resource to avoid overwriting."""
resource_name = os.path.basename(urlparse(resource_url).path)
Expand Down
11 changes: 2 additions & 9 deletions app/web.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,11 +55,7 @@ def view_page():
return jsonify({"error": "Full URL is required"}), 400

sanitized = sanitize_domain_or_url(domain_or_url)
domain_db_path = os.path.normpath(os.path.join(BASE_DIR, f'{sanitized}.db'))

if not domain_db_path.startswith(BASE_DIR):
logging.error(f"Attempted access to a path outside the base directory: {domain_db_path}")
return jsonify({"error": "Invalid path"}), 400
domain_db_path = os.path.join(BASE_DIR, f'{sanitized}.db')

if not os.path.exists(domain_db_path):
logging.error(f"No database found for URL: {domain_or_url}")
Expand Down Expand Up @@ -94,10 +90,7 @@ def view_version():
return jsonify({"error": "URL is required"}), 400

sanitized = sanitize_domain_or_url(domain_or_url)
domain_db_path = os.path.normpath(os.path.join(BASE_DIR, f'{sanitized}.db'))
if not domain_db_path.startswith(BASE_DIR):
logging.error(f"Attempted access to a path outside the base directory: {domain_db_path}")
return jsonify({"error": "Invalid path"}), 400
domain_db_path = os.path.join(BASE_DIR, f'{sanitized}.db')

if not os.path.exists(domain_db_path):
logging.error(f"No database found for URL: {domain_or_url}")
Expand Down

0 comments on commit 025ae9a

Please sign in to comment.