rspec/rspec-tools/rspec_tools/checklinks.py
Arseniy Zaostrovnykh b021d34a09 Use "requests" instead of urllib; improve stability of link probing
"requests" library supports HTTP2, HTTPS, coockies by default and
connection:keep-alive header, which increase the number of positive responses
from the probed servers.

Nevertheless, due to the unreliable nature of the Internet and the flaky
performance of web servers, some links occasionally fail, and with this many
links to probe (1.3k+) it becomes very likely that at least one of them fails.
To mitigate these failures, the script retries the failed links once more.
2021-05-06 11:01:40 +02:00
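
The retry approach described above amounts to a two-pass probe. A minimal
self-contained sketch of the pattern (probe_once here is a stand-in for the
script's live_url helper defined in the file below; the timeouts mirror the
ones used there):

    def confirmed_dead(urls, probe_once):
        # First pass with a short timeout; a second pass, only over the
        # failures, with a longer timeout, so that a single transient hiccup
        # does not fail the whole run.
        failed = [u for u in urls if not probe_once(u, timeout=5)]
        return [u for u in failed if not probe_once(u, timeout=15)]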


import os
import io
import re
import json
import pathlib
# Alias the socket-level timeout so it is not shadowed by the "timeout"
# parameter of live_url() below.
from socket import timeout as socket_timeout

import requests
from bs4 import BeautifulSoup


def show_files(filenames):
    for filename in filenames:
        print(filename)


def live_url(url: str, timeout=5):
    # In-page anchors cannot be probed over the network; consider them alive.
    if url.startswith('#'):
        return True
    try:
        # Browser-like headers (and keep-alive) reduce the number of servers
        # that reject the probe outright.
        req = requests.Request('GET', url, headers={
            'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="90"',
            'sec-ch-ua-mobile': '?0',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3,q=0.9',
            'Sec-Fetch-Site': 'none',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-User': '?1',
            'Sec-Fetch-Dest': 'document',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'en-US,en;q=0.9',
            'Connection': 'keep-alive'})
        session = requests.Session()
        code = session.send(req.prepare(), timeout=timeout).status_code
        if code >= 400:
            print(f"ERROR: {code} Nothing there")
            return False
        return True
    except requests.ConnectionError as ce:
        print(f"ERROR: Connection error {ce}")
        return False
    except requests.HTTPError as h:
        print(f"ERROR: HTTP error {h}")
        return False
    except requests.URLRequired as e:
        print(f"ERROR: Bad URL: {e}")
        return False
    except requests.TooManyRedirects as rr:
        print(f"ERROR: Too many redirects: {rr}")
        return False
    except requests.Timeout as t:
        print(f"ERROR: timeout {t}")
        return False
    except socket_timeout as t:
        print(f"ERROR: timeout {t}")
        return False
    except Exception as e:
        print(f"ERROR: {e}")
        return False


def findurl_in_html(filename, urls):
    # Record every <a href="..."> in the file, remembering which files
    # reference each URL.
    with open(filename, 'r', encoding="utf8") as file:
        soup = BeautifulSoup(file, features="html.parser")
        for link in soup.find_all('a'):
            key = link.get('href')
            if key in urls:
                urls[key].append(filename)
            else:
                urls[key] = [filename]


def is_active(metadata_fname, generic_metadata_fname):
    # A rule is considered active unless its language-specific metadata (or,
    # failing that, the generic metadata) carries a status other than 'ready'.
    try:
        with open(metadata_fname) as metadata_file:
            metadata = json.load(metadata_file)
            if 'status' in metadata:
                return metadata['status'] == 'ready'
        with open(generic_metadata_fname) as generic_metadata_file:
            generic_metadata = json.load(generic_metadata_file)
            if 'status' in generic_metadata:
                return generic_metadata['status'] == 'ready'
    except EnvironmentError:
        return True
    return True


def check_html_links(dir):
    urls = {}
    errors = []
    print("Finding links in html files")
    tot_files = 0
    for rulepath in pathlib.Path(dir).iterdir():
        if rulepath.is_dir():
            generic_metadata = rulepath.joinpath('metadata.json')
            for langpath in rulepath.iterdir():
                if langpath.is_dir():
                    metadata = langpath.joinpath('metadata.json')
                    filepath = langpath.joinpath('rule.html')
                    filename = str(filepath.absolute())
                    if filepath.exists() and is_active(metadata, generic_metadata):
                        tot_files += 1
                        findurl_in_html(filename, urls)
    print("All html files crawled")
    print("Testing links")
    for url in urls:
        print(f"{url} in {len(urls[url])} files")
        if not live_url(url, timeout=5):
            errors.append(url)
    if errors:
        confirmed_errors = []
        print("Retrying failed probes")
        # Retry each failure once with a longer timeout to weed out transient
        # network or server hiccups.
        for key in errors:
            print(f"{key} in {len(urls[key])} files (previously failed)")
            if not live_url(key, timeout=15):
                confirmed_errors.append(key)
        if confirmed_errors:
            print("There were errors")
            for key in confirmed_errors:
                print(f"{key} in:")
                show_files(urls[key])
            print(f"{len(confirmed_errors)}/{len(urls)} links are dead, "
                  "see the list and related files above")
            exit(1)
    print(f"All {len(urls)} links are good")
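
Usage sketch (not part of the file above): check_html_links expects the path
of a directory whose subdirectories are rules, each holding per-language
subdirectories with a rule.html and a metadata.json. The directory name below
is an assumed placeholder, not something the script itself prescribes.

    from rspec_tools.checklinks import check_html_links

    # 'rules' is an assumed local checkout of the rules directory.
    check_html_links('rules')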