import requests
from bs4 import BeautifulSoup
import datetime
import random
import re
from urllib.request import urljoin
# Seed the PRNG from the current wall-clock time.
# random.seed() only accepts None, int, float, str, bytes or bytearray:
# passing a datetime object was deprecated in Python 3.9 and raises
# TypeError from 3.11 on, so convert it to a float timestamp first.
random.seed(datetime.datetime.now().timestamp())
def get_links(article_url, timeout=10):
    """Fetch a Wikipedia page and return the article links in its body.

    Args:
        article_url: Path (e.g. ``'/wiki/Kevin_Bacon'``) or URL of the
            page; it is resolved against ``https://en.wikipedia.org``
            with ``urljoin``.
        timeout: Seconds ``requests`` waits on the server before giving
            up. Without a timeout a stalled connection blocks forever.

    Returns:
        The ``<a>`` tags inside ``<div id="bodyContent">`` whose ``href``
        starts with ``/wiki/`` and contains no colon. An empty list is
        returned when the page has no such ``div``.

    Raises:
        requests.exceptions.RequestException: propagated unchanged from
            ``requests.get`` (connection failure, timeout, ...).
    """
    base_url = 'https://en.wikipedia.org'
    url = urljoin(base_url, article_url)
    print(url)

    response = requests.get(url, timeout=timeout)

    # Name the parser explicitly: otherwise BeautifulSoup picks whichever
    # parser happens to be installed, which makes results environment
    # dependent and emits a GuessedAtParserWarning.
    soup = BeautifulSoup(response.text, 'html.parser')

    body = soup.find('div', {'id': 'bodyContent'})
    if body is None:
        # Not an article-shaped page (e.g. an error page); report "no
        # links" instead of failing with AttributeError on None.
        return []

    # The negative lookahead rejects any href containing ':'.
    article_href = re.compile(r'^(/wiki/)((?!:).)*$')
    return body.find_all('a', href=article_href)
# Starting point of the random walk: the links on the Kevin Bacon article.
# NOTE(review): this fetch sits outside the __main__ guard, so it also runs
# (and hits the network) whenever the module is imported -- confirm that
# this is intended.
links = get_links('/wiki/Kevin_Bacon')

if __name__ == '__main__':
    # Hop to a randomly chosen linked article, over and over, until a page
    # yields no article links at all.
    while links:
        next_anchor = random.choice(links)
        links = get_links(next_anchor.attrs['href'])
        link_count = len(links)
        print('lenth of links is %s' % link_count)