
Mark Chesney
11,669 Points — sharing my code: web crawling
Hi, here's the code solution he encouraged us to share with the community. Hope it works for everyone!
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
# Fetch the site's landing page and list every link found on it.
# The `with` block closes the HTTP response once the markup has been parsed.
with urlopen('https://treehouse-projects.github.io/horse-land/index.html') as html:
    soup = BeautifulSoup(html.read(), 'html.parser')

for link in soup.find_all('a'):
    print(link.get('href'))

# Pages already visited by the crawl, in the order they were first seen.
site_links = []
def internal_links(linkURL):
    """Fetch one horse-land page and return its first link to an ``.html`` page.

    Args:
        linkURL: Path of the page relative to the horse-land site root,
            e.g. ``"index.html"``.

    Returns:
        The first ``<a>`` tag on the page whose ``href`` ends in ``.html``,
        or ``None`` when the page has no such link.
    """
    page_url = 'https://treehouse-projects.github.io/horse-land/{}'.format(linkURL)
    # Close the HTTP response as soon as the markup has been parsed.
    with urlopen(page_url) as response:
        soup = BeautifulSoup(response, 'html.parser')
    # Escape the dot so only a literal ".html" suffix matches
    # (an unescaped "." would also accept e.g. "xhtml").
    return soup.find('a', href=re.compile(r'\.html$'))
if __name__ == '__main__':
    # Follow the first ".html" link on each page, starting from the index,
    # until a page has no such link or the link points to a page already seen.
    anchor = internal_links("index.html")
    # Compare against None rather than calling len(): internal_links() returns
    # None when nothing matches (len(None) raises TypeError), and len() of a
    # tag counts its children, so an empty <a></a> would wrongly end the crawl.
    while anchor is not None:
        page = anchor.attrs['href']
        if page in site_links:
            # Already visited: stop instead of looping between pages forever.
            break
        site_links.append(page)
        print(page)
        print('\n=============\n')
        anchor = internal_links(page)