import re

import requests
from bs4 import BeautifulSoup


# Scrape e-mail addresses from a single web page and print one per line.

# Page that is downloaded and scanned when the file is run as a script.
PAGE_URL = 'https://www.arborstonestorageponcacity.com/blog'

# Seconds to wait for the server; without a timeout requests can block forever.
REQUEST_TIMEOUT = 10

# Matches plain ("user@example.com") and lightly obfuscated
# ("user at example dot com") addresses in lower-cased text.
# Group 1 is the whole address; groups 2 and 3 only capture the separators.
# Raw strings are required so that \/ \. and \s reach the regex engine
# unchanged instead of being treated as (invalid) string escapes.
regex = re.compile(
    r"([a-z0-9!#$%&'*+\/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+\/=?^_`"
    r"{|}~-]+)*(@|\sat\s)(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?(\.|"
    r"\sdot\s))+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)"
)

# Tag types whose markup is dropped before the page is searched.
remove_tags = ['noscript', 'style', 'link', 'img', 'input']


def find_emails(text):
    """Return the unique e-mail addresses found in *text*, in first-seen order.

    The text is lower-cased before matching because the pattern only contains
    lower-case character classes. Only the full address (group 1) is kept;
    the separator groups are discarded.

    Args:
        text: Any string, typically serialized HTML.

    Returns:
        A list of lower-cased address strings without duplicates.
    """
    lowered = text.lower()
    # dict.fromkeys removes duplicates while preserving insertion order.
    return list(dict.fromkeys(m.group(1) for m in regex.finditer(lowered)))


def extract_searchable_text(soup):
    """Strip unwanted tags from *soup* and return the markup to be searched.

    External scripts (``<script src=...>``) and every tag type listed in
    ``remove_tags`` are removed from the tree in place. The serialized
    ``<body>`` and ``<footer>`` elements are then returned, joined by a
    newline; elements that are absent are skipped.

    Args:
        soup: A parsed ``BeautifulSoup`` document. It is modified in place.

    Returns:
        The remaining body/footer markup as a single string.
    """
    for script_tag in soup.find_all('script'):
        # Inline scripts are kept; only scripts loaded via "src" are dropped.
        if script_tag.has_attr('src'):
            script_tag.decompose()

    for tag_type in remove_tags:
        for junk_tag in soup.find_all(tag_type):
            junk_tag.decompose()

    sections = (soup.find('body'), soup.find('footer'))
    return '\n'.join(str(section) for section in sections if section is not None)


def main():
    """Download ``PAGE_URL`` and print each e-mail address found on it.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    response = requests.get(PAGE_URL, timeout=REQUEST_TIMEOUT)
    # Fail loudly instead of scanning an error page for addresses.
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'lxml')
    for email in find_emails(extract_searchable_text(soup)):
        print(email)


if __name__ == '__main__':
    main()