
"requests" library supports HTTP2, HTTPS, coockies by default and connection:keep-alive header, which increase the number of positive responses from the probed servers. Nevertheless, due to unreliable nature of the Internet and flaky performance of web servers some links occasionaly fail, and with this many links to probe (1.3k+) it becomes very likely that at least one of them fails. To mitigate these failures, the script retries the failed links once more.
import os, io
import re
import requests
import json
from bs4 import BeautifulSoup
from socket import timeout as socket_timeout  # aliased so it is not shadowed by live_url's `timeout` parameter
import pathlib


def show_files(filenames):
    for filename in filenames:
        print(filename)


def live_url(url: str, timeout=5):
    # In-page anchors are always considered alive.
    if url.startswith('#'):
        return True
    try:
        # Browser-like headers: some servers reject requests that do not look like a real browser.
        req = requests.Request('GET', url, headers={
            'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="90"',
            'sec-ch-ua-mobile': '?0',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3,q=0.9',
            'Sec-Fetch-Site': 'none',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-User': '?1',
            'Sec-Fetch-Dest': 'document',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'en-US,en;q=0.9',
            'Connection': 'keep-alive'})
        session = requests.Session()
        code = session.send(req.prepare(), timeout=timeout).status_code
        if code >= 400:
            print(f"ERROR: {code} Nothing there")
            return False
        else:
            return True
    except requests.ConnectionError as ce:
        print(f"ERROR: Connection error {ce}")
        return False
    except requests.HTTPError as h:
        print(f"ERROR: HTTP error {h}")
        return False
    except requests.URLRequired as e:
        print(f"ERROR: Bad URL: {e}")
        return False
    except requests.TooManyRedirects as rr:
        print(f"ERROR: Too many redirects: {rr}")
        return False
    except requests.Timeout as t:
        print(f"ERROR: timeout {t}")
        return False
    except socket_timeout as t:
        print(f"ERROR: timeout {t}")
        return False
    except Exception as e:
        print(f"ERROR: {e}")
        return False


def findurl_in_html(filename, urls):
    # Collect every href in the file into the shared url -> [files] map.
    with open(filename, 'r', encoding="utf8") as file:
        soup = BeautifulSoup(file, features="html.parser")
        for link in soup.find_all('a'):
            key = link.get('href')
            if key is None:  # <a> without href: nothing to probe
                continue
            if key in urls:
                urls[key].append(filename)
            else:
                urls[key] = [filename]


def is_active(metadata_fname, generic_metadata_fname):
    # A rule is active when its language-specific or generic metadata says status == 'ready'.
    try:
        with open(metadata_fname) as metadata_file:
            metadata = json.load(metadata_file)
            if 'status' in metadata:
                return metadata['status'] == 'ready'
        with open(generic_metadata_fname) as generic_metadata_file:
            generic_metadata = json.load(generic_metadata_file)
            if 'status' in generic_metadata:
                return generic_metadata['status'] == 'ready'
    except EnvironmentError:
        # Missing metadata files: treat the rule as active.
        return True
    return True


def check_html_links(dir):
    urls = {}
    errors = []
    print("Finding links in html files")
    tot_files = 0
    # Walk <dir>/<rule>/<language>/rule.html and gather the links of every active rule.
    for rulepath in pathlib.Path(dir).iterdir():
        if rulepath.is_dir():
            generic_metadata = rulepath.joinpath('metadata.json')
            for langpath in rulepath.iterdir():
                if langpath.is_dir():
                    metadata = langpath.joinpath('metadata.json')
                    filepath = langpath.joinpath('rule.html')
                    filename = str(filepath.absolute())
                    if filepath.exists() and is_active(metadata, generic_metadata):
                        tot_files += 1
                        findurl_in_html(filename, urls)
    print("All html files crawled")
    print("Testing links")
    # First pass: probe every link once with a short timeout.
    for url in urls:
        print(f"{url} in {len(urls[url])} files")
        if not live_url(url, timeout=5):
            errors.append(url)
    if errors:
        confirmed_errors = []
        print("Retrying failed probes")
        # Second pass: retry only the failures, with a longer timeout.
        for key in errors:
            print(f"{key} in {len(urls[key])} files (previously failed)")
            if not live_url(key, timeout=15):
                confirmed_errors.append(key)
        if confirmed_errors:
            print("There were errors")
            for key in confirmed_errors:
                print(f"{key} in:")
                show_files(urls[key])
            print(f"{len(confirmed_errors)}/{len(urls)} links are dead, see the list and related files before")
            exit(1)
    print(f"All {len(urls)} links are good")