import requests
import re
from bs4 import BeautifulSoup

# First method of scraping, simple and dirty (with requests and regex)
# # get the data
# data = requests.get('http://www.mg-cc.org/club-information/club-contacts')
#
# # extract the phone numbers and emails
#
# phones = re.findall(r'(\(?[0-9]{3}\)?(?:\-|\s|\.)?[0-9]{3}(?:\-|\.)[0-9]{4})', data.text)
# emails = re.findall(r'([\d\w\.]+@[\d\w\.\-]+\.\w+)', data.text)
#
# print(phones, emails)

# Second method: scraping with Beautiful Soup

data = requests.get('https://raw.githubusercontent.com/engineer-man/youtube/master/042/scrape.html')
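# optional safeguard (an addition, not in the original): raise immediately if
# the HTTP request failed; raise_for_status() is standard requests API
data.raise_for_status()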

# load the HTML into bs4
soup = BeautifulSoup(data.text, 'html.parser')

# get data by looking at every tr in the document
scraped_data = []
for tr in soup.find_all('tr'):
    values = [td.text for td in tr.find_all('td')]
    scraped_data.append(values)

print(scraped_data)
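
# hedged refinement (an assumption, not in the original): header rows built
# from <th> cells yield empty lists above, so keep only rows with <td> data
scraped_rows = [row for row in scraped_data if row]
print(scraped_rows)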

# get data only from tr elements with the class 'special'
scraped_data_special = []
for tr in soup.find_all('tr', {'class': 'special'}):
    values_special = [td.text for td in tr.find_all('td')]
    scraped_data_special.append(values_special)
print(scraped_data_special)
| 37 | + |
| 38 | +# get data within specific element |
| 39 | +specific_element = [] |
| 40 | +div = soup.find('div', {'class': 'special_table'}) |
| 41 | +for tr in div.find_all('tr'): |
| 42 | + values_special_class = [td.text for td in tr.find_all('td')] |
| 43 | + specific_element.append(values_special_class) |
| 44 | +print(specific_element) |
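
# equivalent sketch (an alternative, not in the original): select() takes a
# CSS selector, so the scoped lookup above can be written in one expression
specific_element_css = [
    [td.text for td in tr.find_all('td')]
    for tr in soup.select('div.special_table tr')
]
print(specific_element_css)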