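"""Probe the external links referenced from rule.html files under a given directory.

Successful probes are cached in LINK_PROBES_HISTORY_FILE so that recently
reached URLs are not re-checked on every run; failed probes are retried and
only confirmed-dead links make the check fail.
"""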
import datetime
import json
import pathlib
import random
import socket

import requests
from bs4 import BeautifulSoup

TOLERABLE_LINK_DOWNTIME = datetime.timedelta(days=7)
LINK_PROBES_HISTORY_FILE = './link_probes.history'
PROBING_COOLDOWN = datetime.timedelta(days=2)
PROBING_SPREAD = 60 * 24  # in minutes, 1 day

link_probes_history = {}
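# A URL that was reached within the last PROBING_COOLDOWN (plus a random
# spread of up to PROBING_SPREAD minutes, so that not every URL comes up for
# re-probing in the same run) is not probed again; a failing probe is only
# reported as an error once the URL has been unreachable for longer than
# TOLERABLE_LINK_DOWNTIME.
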
# These links consistently fail in CI, but work-on-my-machine
EXCEPTION_PREFIXES = [
    # It seems the server certificate was renewed on 2nd of August 2024.
    # The server is sending only its certificate, without including the
    # intermediate certificate used to issue the server cert. Because of that,
    # some applications are not able to verify the complete chain of trust.
    "https://wiki.sei.cmu.edu/",
    # The CI reports 403 on drupal.org while it works locally.
    # Maybe the CI's IP is blocklisted...
    "https://www.drupal.org/",
]

def show_files(filenames):
    for filename in filenames:
        print(filename)


def load_url_probing_history():
    global link_probes_history
    try:
        with open(LINK_PROBES_HISTORY_FILE, 'r') as link_probes_history_stream:
            print('Using the historical url-probe results from ' + LINK_PROBES_HISTORY_FILE)
            # The file contains the repr() of a dict mapping each URL to the
            # datetime when it was last reached successfully.
            link_probes_history = eval(link_probes_history_stream.read())
    except Exception as e:
        # If the history file is not present, ignore; a new one is created at the end.
        print(f"Failed to load historical url-probe results: {e}")


def save_url_probing_history():
    global link_probes_history
    with open(LINK_PROBES_HISTORY_FILE, 'w') as link_probes_history_stream:
        link_probes_history_stream.write(str(link_probes_history))


def rejuvenate_url(url: str):
    global link_probes_history
    link_probes_history[url] = datetime.datetime.now()


def url_is_long_dead(url: str):
    global link_probes_history
    if url not in link_probes_history:
        return True
    last_time_up = link_probes_history[url]
    print(f"{url} was reached most recently on {last_time_up}")
    return TOLERABLE_LINK_DOWNTIME < (datetime.datetime.now() - last_time_up)


def url_was_reached_recently(url: str):
    global link_probes_history
    if url not in link_probes_history:
        return False
    last_time_up = link_probes_history[url]
    spread = random.randrange(PROBING_SPREAD)
    probing_cooldown = PROBING_COOLDOWN + datetime.timedelta(minutes=spread)
    return (datetime.datetime.now() - last_time_up) < probing_cooldown


def live_url(url: str, timeout=5):
    if url.startswith('#'):
        return True
    try:
        req = requests.Request('GET', url, headers={
            'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="90"',
            'sec-ch-ua-mobile': '?0',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 GLS/100.10.9939.100',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'Sec-Fetch-Site': 'none',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-User': '?1',
            'Sec-Fetch-Dest': 'document',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'en-US,en;q=0.9',
            'Connection': 'keep-alive'})
        session = requests.Session()
        code = session.send(req.prepare(), timeout=timeout).status_code
        if code >= 400:
            print(f"ERROR: {code} Nothing there")
            return False
        else:
            return True
    except requests.ConnectionError as ce:
        print(f"ERROR: Connection error {ce}")
        return False
    except requests.HTTPError as h:
        print(f"ERROR: HTTP error {h}")
        return False
    except requests.URLRequired as e:
        print(f"ERROR: Bad URL: {e}")
        return False
    except requests.TooManyRedirects as rr:
        print(f"ERROR: Too many redirects: {rr}")
        return False
    except requests.Timeout as t:
        print(f"ERROR: Request timeout {t}")
        return False
    except socket.timeout as t:
        print(f"ERROR: Socket timeout {t}")
        return False
    except Exception as e:
        print(f"ERROR: {e}")
        return False


def findurl_in_html(filename, urls):
    with open(filename, 'r', encoding="utf8") as file:
        soup = BeautifulSoup(file, features="html.parser")
        for link in soup.find_all('a'):
            key = link.get('href')
            if key in urls:
                urls[key].append(filename)
            else:
                urls[key] = [filename]


def is_active(metadata_fname, generic_metadata_fname):
    try:
        with open(metadata_fname) as metadata_file:
            metadata = json.load(metadata_file)
            if 'status' in metadata:
                return metadata['status'] == 'ready'
        with open(generic_metadata_fname) as generic_metadata_file:
            generic_metadata = json.load(generic_metadata_file)
            if 'status' in generic_metadata:
                return generic_metadata['status'] == 'ready'
    except EnvironmentError:
        return True
    return True


def get_all_links_from_htmls(dir):
    print("Finding links in html files")
    urls = {}
    for rulepath in pathlib.Path(dir).iterdir():
        if not rulepath.is_dir():
            continue
        generic_metadata = rulepath.joinpath('metadata.json')
        for langpath in rulepath.iterdir():
            if not langpath.is_dir():
                continue
            metadata = langpath.joinpath('metadata.json')
            filepath = langpath.joinpath('rule.html')
            filename = str(filepath.absolute())
            if filepath.exists() and is_active(metadata, generic_metadata):
                findurl_in_html(filename, urls)
    print("All html files crawled")
    return urls


def url_is_exception(url: str) -> bool:
    return any(
        url.startswith(e) for e in EXCEPTION_PREFIXES
    )


def probe_links(urls: dict) -> bool:
    errors = []
    link_cache_exception = 0
    link_cache_hit = 0
    link_cache_miss = 0
    print("Testing links")
    link_count = len(urls)
    for idx, url in enumerate(urls):
        print(f"[{idx+1}/{link_count}] {url} in {len(urls[url])} files")
        if url_is_exception(url):
            link_cache_exception += 1
            print("skip as an exception")
        elif url_was_reached_recently(url):
            link_cache_hit += 1
            print("skip probing because it was reached recently")
        elif live_url(url, timeout=5):
            link_cache_miss += 1
            rejuvenate_url(url)
        elif url_is_long_dead(url):
            link_cache_miss += 1
            errors.append(url)
        else:
            link_cache_miss += 1

    confirmed_errors = confirm_errors(errors, urls)

    print(f"\n\n\n{'=' * 80}\n\n\n")
    if confirmed_errors:
        report_errors(confirmed_errors, urls)
        print(f"{len(confirmed_errors)}/{len(urls)} links are dead, see above ^^ the list and the related files\n\n")
    print("Cache statistics:")
    print(f"\t{link_cache_hit=}")
    print(f"\t{link_cache_miss=}")
    link_cache_hit_ratio = link_cache_hit / (link_cache_hit + link_cache_miss)
    print(f"\t{link_cache_hit_ratio:03.2%} hits")
    print(f"\t{link_cache_exception=}")
    print(f"\n\n\n{'=' * 80}\n\n\n")

    success = len(confirmed_errors) == 0
    return success


def confirm_errors(presumed_errors, urls):
    confirmed_errors = []
    print(f"Retrying {len(presumed_errors)} failed probes")
    for key in presumed_errors:
        print(f"{key} in {len(urls[key])} files (previously failed)")
        if not live_url(key, timeout=15):
            confirmed_errors.append(key)
        else:
            rejuvenate_url(key)
    return confirmed_errors


def report_errors(errors, urls):
    print("There were errors")
    for key in errors:
        print(f"{key} in:")
        show_files(urls[key])


def check_html_links(dir):
    load_url_probing_history()
    urls = get_all_links_from_htmls(dir)
    success = probe_links(urls)
    if success:
        print(f"All {len(urls)} links are good")
    save_url_probing_history()
    exit(0 if success else 1)
|
2021-02-12 15:18:24 +01:00
|
|
|
|
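

# A minimal sketch of a direct invocation, assuming this file can be run as a
# standalone script; the real entry point may live elsewhere, and the default
# './rules' directory below is only an illustrative assumption.
if __name__ == '__main__':
    import sys

    # Directory containing the per-rule folders with rule.html files.
    rules_dir = sys.argv[1] if len(sys.argv) > 1 else './rules'
    check_html_links(rules_dir)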