# Source code for AutomateTheBoringStuff.Ch15.Projects.P2_scheduledComicDownloader

"""Scheduled comic downloader

Write a program that checks the websites of several web comics and automatically
downloads the images if the comic was updated since the program’s last visit.

Your operating system’s scheduler (Scheduled Tasks on Windows, launchd on OS X,
and cron on Linux) can run your Python program once a day.

The Python program itself can download the comic and then copy it to your desktop
so that it is easy to find. This will free you from having to check the website
yourself to see whether it has updated.

Note:
    This only downloads from http://www.lefthandedtoons.com/ and
    http://buttersafe.com/ because all websites are different.

"""

import os, requests, bs4, datetime, shelve, re


def get_soup(url_arg: str) -> bs4.BeautifulSoup:
    """Fetch a web page and parse it into a soup object.

    The page is requested with :py:mod:`requests` and the response text is
    handed to :class:`bs4.BeautifulSoup` using the ``lxml`` parser.

    Args:
        url_arg: Address of the page to fetch.

    Returns:
        Parsed :class:`bs4.BeautifulSoup` tree of the page.

    Raises:
        requests.exceptions.HTTPError: If the server answered with an error
            status.

    """
    print(f'Downloading page {url_arg}...')
    response = requests.get(url_arg)
    # Fail early on 4xx/5xx instead of parsing an error page.
    response.raise_for_status()
    page_markup = response.text
    return bs4.BeautifulSoup(page_markup, 'lxml')
def compare_timestamps(timestamp_arg: str, shelf_arg: shelve.open, url_arg: str) -> bool:
    """Tell whether the site's current comic is newer than the stored one.

    The comic's date string is parsed and compared with the date saved in
    the shelf under the website url.

    Args:
        timestamp_arg: Comic date written as ``Month DD, YYYY``.
        shelf_arg: :py:mod:`shelve` object (or any mapping) with website urls
            as keys and :class:`datetime.date` objects as values.
        url_arg: Website url used as the lookup key in the shelf.

    Returns:
        True when the comic's date is strictly later than the saved date,
        False otherwise (including when both dates are equal).

    """
    # Parse first, then look up, so a malformed date is reported before a
    # missing key would be.
    parsed_stamp = datetime.datetime.strptime(timestamp_arg, '%B %d, %Y')
    last_saved = shelf_arg[url_arg]
    return parsed_stamp.date() > last_saved
def check_key(shelf_arg: shelve.open, url_arg: str) -> bool:
    """Check key

    Checks if given url is a key in the given shelf.

    Args:
        shelf_arg: :py:mod:`shelve` object (or any mapping) with urls as keys
            and :class:`datetime.date` objects as values.
        url_arg: String with website url.

    Returns:
        True if the url is in the shelf, False otherwise.

    """
    # Test membership on the mapping itself. The previous code referenced
    # ``shelf_arg.keys`` without calling it, so ``in`` was applied to a bound
    # method and raised TypeError on every call.
    return url_arg in shelf_arg
def save_comic(comic_url_arg: str, shelf_arg: shelve.open, url_arg: str) -> None:
    """Save comic

    Downloads given comic url and saves it to the desktop, then records
    today's date for the given website url in the given shelf.

    Args:
        comic_url_arg: String with url of comic image.
        shelf_arg: :py:mod:`shelve` object with urls as keys and
            :class:`datetime.date` objects as values.
        url_arg: String with website url.

    Returns:
        None. Comic image is saved to ``~/Desktop`` under the basename of the
        comic url.

    Raises:
        requests.exceptions.HTTPError: If download of comic url failed.

    """
    print(f'Downloading image {comic_url_arg}...')
    comic_res = requests.get(comic_url_arg)
    comic_res.raise_for_status()

    # Save the comic to desktop.
    image_path = os.path.join(os.path.expanduser('~/Desktop'),
                              os.path.basename(comic_url_arg))
    # The context manager closes the file even if a chunk fails to download
    # or write, so the handle is never leaked.
    with open(image_path, 'wb') as image_file:
        for chunk in comic_res.iter_content(100000):
            image_file.write(chunk)

    # Only reached after the image was written completely.
    shelf_arg[url_arg] = datetime.datetime.now().date()
def _download_if_new(comic_url: str, comic_timestamp: str, comic_shelf: shelve.Shelf, url: str) -> None:
    """Save the comic when the site is unseen or its comic is newer than the saved date."""
    if not check_key(comic_shelf, url) or compare_timestamps(comic_timestamp, comic_shelf, url):
        save_comic(comic_url, comic_shelf, url)


def _check_lefthandedtoons(comic_shelf: shelve.Shelf) -> None:
    """Check http://www.lefthandedtoons.com/ and download its comic if it is new."""
    # Download page
    url = 'http://www.lefthandedtoons.com/'
    soup = get_soup(url)

    # Get comic url in case it needs to be downloaded
    image_elem = soup.select('.comicimage')
    if not image_elem:
        print('Could not find comic image.')
        return
    comic_url = image_elem[0].get('src')

    # Compare page's timestamp to shelve's
    comic_title_elem = soup.select('.comictitlearea')
    if not comic_title_elem:
        print('Could not find comic timestamp.')
        return
    title_text = comic_title_elem[0].getText()
    match = re.search(r'\w+ \d+, \d{4}', title_text)
    if match is None:
        # Title area exists but holds no "Month DD, YYYY" date.
        print('Could not find comic timestamp.')
        return
    _download_if_new(comic_url, match.group(), comic_shelf, url)


def _check_buttersafe(comic_shelf: shelve.Shelf) -> None:
    """Check http://buttersafe.com/ and download its comic if it is new."""
    # Download page
    url = 'http://buttersafe.com/'
    soup = get_soup(url)

    # Get comic url in case it needs to be downloaded
    div_elem = soup.find('div', attrs={'id': 'comic'})
    img_elem = div_elem.find('img') if div_elem is not None else None
    if img_elem is None:
        print('Could not find comic image.')
        return
    comic_url = img_elem['src']

    # Compare page's timestamp to shelve's
    comic_header = soup.select('#headernav-date')
    if not comic_header:
        print('Could not find comic timestamp.')
        return
    header_text = comic_header[0].getText()
    # Header looks like "Weekday, Month DDth, YYYY"; the two wildcard
    # characters skip the ordinal suffix after the day number.
    match = re.search(r'(\w+), (\w+) (\d+).., (\d{4})', header_text)
    if match is None:
        print('Could not find comic timestamp.')
        return
    comic_timestamp = f'{match.group(2)} {match.group(3)}, {match.group(4)}'
    _download_if_new(comic_url, comic_timestamp, comic_shelf, url)


def main():
    """Check every supported comic site and download comics that are new.

    Opens the ``comic`` shelf, which maps each website url to the date of its
    last download, and checks http://www.lefthandedtoons.com/ followed by
    http://buttersafe.com/. A comic is saved when its site has no entry in
    the shelf yet or when the date shown on the page is later than the
    stored one.

    Returns:
        None. New comics are written to the desktop by :func:`save_comic`.

    Raises:
        requests.exceptions.HTTPError: If a page or image download failed.

    """
    # The context manager closes the shelf even when a download or parse
    # step raises, so entries already written are not lost.
    with shelve.open('comic') as comic_shelf:
        _check_lefthandedtoons(comic_shelf)
        _check_buttersafe(comic_shelf)
# Run the downloader only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()