import pathlib
import sys
from socket import timeout
from urllib.error import HTTPError, URLError
from urllib.request import Request, urlopen

from bs4 import BeautifulSoup


def show_files(filenames):
    """Print each filename that references a given link."""
    for filename in filenames:
        print(filename)


def live_url(url):
    """Return True if the URL responds with a non-error status, False otherwise."""
    req = Request(
        url,
        data=None,
        headers={
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
        }
    )
    try:
        code = urlopen(req, timeout=5).code
        if code >= 400:
            print(f"ERROR: {code} Nothing there")
            return False
        return True
    except HTTPError as h:
        print(f"ERROR: {h.code} {h.reason}")
        return False
    except URLError as e:
        print(f"ERROR: {e.reason}")
        return False
    except ConnectionError as c:
        print(f"ERROR: connection error {c}")
        return False
    except timeout as t:
        print(f"ERROR: timeout {t}")
        return False
    except Exception as e:
        print(f"ERROR: {e}")
        return False


def findurl_in_html(filename, urls):
    """Collect every href in the file, mapping each URL to the files that contain it."""
    with open(filename, 'r', encoding="utf8") as file:
        soup = BeautifulSoup(file, features="html.parser")
        for link in soup.find_all('a'):
            key = link.get('href')
            if key is None:
                # Skip <a> tags that have no href attribute.
                continue
            if key in urls:
                urls[key].append(filename)
            else:
                urls[key] = [filename]


def check_html_links(dir):
    """Crawl all HTML files under dir, test every link found, and report dead ones."""
    urls = {}
    errors = []
    print("Finding links in html files")
    for filepath in pathlib.Path(dir).glob('**/*.html'):
        filename = str(filepath.absolute())
        findurl_in_html(filename, urls)
    print("All html files crawled")
    print("Testing links")
    for url in urls:
        print(f"{url} in {len(urls[url])} files")
        if not live_url(url):
            errors.append(url)
    if errors:
        print("There were errors")
        for key in errors:
            print(f"{key} in:")
            show_files(urls[key])
        print(f"{len(errors)}/{len(urls)} links are dead, see the list of links and related files above")
        sys.exit(1)
    else:
        print(f"All {len(urls)} links are good")
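

# Example entry point (a minimal sketch, not part of the original script): it assumes
# the directory of HTML files to scan is passed as a command-line argument.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Check for dead links in local HTML files")
    parser.add_argument("directory", help="directory to scan recursively for *.html files")
    args = parser.parse_args()
    check_html_links(args.directory)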