import json
import pathlib
import sys
from socket import timeout as SocketTimeout

import requests
from bs4 import BeautifulSoup

# Browser-like headers so servers do not reject the probe as an obvious bot request.
BROWSER_HEADERS = {
    'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="90"',
    'sec-ch-ua-mobile': '?0',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/90.0.4430.93 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,'
              'image/apng,*/*;q=0.8,application/signed-exchange;v=b3,q=0.9',
    'Sec-Fetch-Site': 'none',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-User': '?1',
    'Sec-Fetch-Dest': 'document',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'en-US,en;q=0.9',
    'Connection': 'keep-alive',
}


def show_files(filenames):
    """Print one filename per line."""
    for filename in filenames:
        print(filename)


def live_url(url: str, timeout=5):
    """Return True if the URL answers with a non-error status, False otherwise."""
    # Intra-page anchors cannot be probed over HTTP; treat them as alive.
    if url.startswith('#'):
        return True
    try:
        req = requests.Request('GET', url, headers=BROWSER_HEADERS)
        session = requests.Session()
        code = session.send(req.prepare(), timeout=timeout).status_code
        if code >= 400:  # 4xx and 5xx responses count as dead links
            print(f"ERROR: {code} Nothing there")
            return False
        return True
    except requests.ConnectionError as ce:
        print(f"ERROR: Connection error {ce}")
        return False
    except requests.HTTPError as h:
        print(f"ERROR: HTTP error {h}")
        return False
    except requests.URLRequired as e:
        print(f"ERROR: Bad URL: {e}")
        return False
    except requests.TooManyRedirects as rr:
        print(f"ERROR: Too many redirects: {rr}")
        return False
    except requests.Timeout as t:
        print(f"ERROR: timeout {t}")
        return False
    except SocketTimeout as t:  # aliased import so it is not shadowed by the `timeout` parameter
        print(f"ERROR: timeout {t}")
        return False
    except Exception as e:
        print(f"ERROR: {e}")
        return False


def findurl_in_html(filename, urls):
    """Collect every <a href> in the file into the urls dict (href -> list of files)."""
    with open(filename, 'r', encoding="utf8") as file:
        soup = BeautifulSoup(file, features="html.parser")
    for link in soup.find_all('a'):
        key = link.get('href')
        if key is None:  # anchor without an href attribute, nothing to check
            continue
        if key in urls:
            urls[key].append(filename)
        else:
            urls[key] = [filename]


def is_active(metadata_fname, generic_metadata_fname):
    """A rule is active unless its language-specific or generic metadata says it is not 'ready'."""
    try:
        with open(metadata_fname) as metadata_file:
            metadata = json.load(metadata_file)
            if 'status' in metadata:
                return metadata['status'] == 'ready'
        with open(generic_metadata_fname) as generic_metadata_file:
            generic_metadata = json.load(generic_metadata_file)
            if 'status' in generic_metadata:
                return generic_metadata['status'] == 'ready'
    except EnvironmentError:
        return True
    return True


def check_html_links(dir):
    """Crawl every active rule.html under dir, probe all links, and exit with status 1 on confirmed dead ones."""
    urls = {}
    errors = []
    print("Finding links in html files")
    tot_files = 0
    for rulepath in pathlib.Path(dir).iterdir():
        if rulepath.is_dir():
            generic_metadata = rulepath.joinpath('metadata.json')
            for langpath in rulepath.iterdir():
                if langpath.is_dir():
                    metadata = langpath.joinpath('metadata.json')
                    filepath = langpath.joinpath('rule.html')
                    filename = str(filepath.absolute())
                    if filepath.exists() and is_active(metadata, generic_metadata):
                        tot_files += 1
                        findurl_in_html(filename, urls)
    print(f"All {tot_files} html files crawled")
    print("Testing links")
    for url in urls:
        print(f"{url} in {len(urls[url])} files")
        if not live_url(url, timeout=5):
            errors.append(url)
    if errors:
        # Retry failures once with a longer timeout to filter out transient problems.
        confirmed_errors = []
        print(f"Retrying {len(errors)} failed probes")
        for key in errors:
            print(f"{key} in {len(urls[key])} files (previously failed)")
            if not live_url(key, timeout=15):
                confirmed_errors.append(key)
        if confirmed_errors:
            print("There were errors")
            for key in confirmed_errors:
                print(f"{key} in:")
                show_files(urls[key])
            print(f"{len(confirmed_errors)}/{len(urls)} links are dead, see the list and related files above")
            sys.exit(1)
    print(f"All {len(urls)} links are good")
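

# --- Hedged usage sketch (not part of the original script) ---
# A minimal command-line entry point showing how check_html_links() might be
# invoked. The argument name "rules_dir" and the default "rules" directory are
# assumptions made for illustration only; adapt them to the repository layout.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description="Probe every <a href> link found in active rule.html files")
    parser.add_argument("rules_dir", nargs="?", default="rules",
                        help="root directory with one sub-directory per rule (assumed layout)")
    args = parser.parse_args()
    check_html_links(args.rules_dir)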