Source code for AutomateTheBoringStuff.Ch11.Projects.P4_linkVerification

"""Link verification

Write a program that, given the URL of a web page, will attempt to download
every linked page on the page. The program should flag any pages that have a
404 “Not Found” status code and print them out as broken links.

"""


import os
from urllib.request import urlretrieve

import bs4
import requests


def main():
    # Fetch the page
    res = requests.get("http://JoseALerma.com")
    res.raise_for_status()  # Raise an exception on a 4xx/5xx response
    soup = bs4.BeautifulSoup(res.text, "lxml")

    # Parse the page for all absolute links; str() guards against anchors
    # that have no href attribute (link is None)
    anchors = soup.find_all("a")
    links = []
    for anchor in anchors:
        link = anchor.get("href")
        if str(link).startswith("http"):
            links.append(link)

    # Add known-missing pages to exercise the 404 check
    links.append("http://JoseALerma.com/potato")
    links.append("http://JoseALerma.com/carrot")

    # Download every linked page into ./pages
    os.makedirs("pages", exist_ok=True)
    for link in links:
        try:
            # A HEAD request returns only the headers, which is faster than
            # downloading the whole page just to read the status code
            res = requests.head(link)
            if res.status_code == 404:
                # Flag broken links
                print("Page not found: %s" % link)
            else:
                filepath = os.path.join("pages", os.path.basename(link + ".html"))
                urlretrieve(link, filepath)
        except requests.exceptions.ConnectionError:
            print("Unable to connect to: %s" % link)


if __name__ == '__main__':
    main()
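
# Sketch: the project description says the program should be "given the URL of
# a web page", while main() above hardcodes http://JoseALerma.com. A minimal
# command-line variant is sketched below under that assumption; the function
# name check_links_from_argv is hypothetical and not part of the original
# solution.
def check_links_from_argv():
    import sys
    import bs4
    import requests

    # Fall back to the original target if no URL is passed on the command line
    url = sys.argv[1] if len(sys.argv) > 1 else "http://JoseALerma.com"
    res = requests.get(url)
    res.raise_for_status()
    soup = bs4.BeautifulSoup(res.text, "lxml")
    for anchor in soup.find_all("a"):
        link = anchor.get("href")
        if not str(link).startswith("http"):
            continue
        try:
            # Only the status code is needed, so a HEAD request suffices
            if requests.head(link).status_code == 404:
                print("Page not found: %s" % link)
        except requests.exceptions.ConnectionError:
            print("Unable to connect to: %s" % link)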