# Source code for AutomateTheBoringStuff.Ch11.P2_parseHTML

"""Parse HTML

This program uses :py:mod:`requests` to fetch an HTML page and :py:mod:`bs4` to parse it.

"""


def main():
    """Demonstrate fetching a web page and parsing HTML with BeautifulSoup.

    Downloads ``http://nostarch.com`` with :py:mod:`requests`, then parses the
    downloaded text and the local file ``example.html`` with :py:mod:`bs4`,
    printing the types, text and attributes of selected elements.

    ``example.html`` is opened relative to the current working directory.

    :raises requests.exceptions.HTTPError: if the download returns a 4xx/5xx
        status.
    :raises FileNotFoundError: if ``example.html`` does not exist.
    :raises IndexError: if ``example.html`` has no element with id ``author``
        or no ``<span>`` element.
    """
    import requests
    import bs4

    # Creating a BeautifulSoup Object from HTML
    res = requests.get("http://nostarch.com")
    res.raise_for_status()  # Raise HTTPError on a 4xx/5xx response
    noStarchSoup = bs4.BeautifulSoup(res.text, "lxml")  # Specify parser to avoid warning
    print(type(noStarchSoup))

    # A file object can be passed directly; the document is read while the
    # soup is constructed, so the file can be closed right afterwards.
    with open("example.html") as exampleFile:
        exampleSoup = bs4.BeautifulSoup(exampleFile, "lxml")
    print(type(exampleSoup))

    # Finding an Element with the select() Method
    with open("example.html") as exampleFile:
        exampleSoup = bs4.BeautifulSoup(exampleFile.read(), "lxml")
    elems = exampleSoup.select("#author")
    print(type(elems))
    print(len(elems))
    print(type(elems[0]))
    print(elems[0].getText())
    print(str(elems[0]))
    print(elems[0].attrs)

    # Show the markup and the text of the first three paragraphs
    # (fewer if the document contains fewer).
    pElems = exampleSoup.select('p')
    for pElem in pElems[:3]:
        print(str(pElem))
        print(pElem.getText())

    # Getting Data from an Element's Attributes
    with open("example.html") as exampleFile:
        soup = bs4.BeautifulSoup(exampleFile, "lxml")
    spanElem = soup.select('span')[0]
    print(str(spanElem))
    print(spanElem.get("id"))
    # get() returns None for an attribute the element does not have
    print(spanElem.get("some_nonexistent_addr") is None)
    print(spanElem.attrs)
# Run the demonstration only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()