Difference between revisions of "Python Web Scraping"
Jump to navigation
Jump to search
(Created page with "==Web Scraping== <pre> # pip install bs4, requests, pandas # install them one at a time import requests from bs4 import BeautifulSoup import pandas as pd nj = 'https://forecas...") |
|||
Line 91:
</pre>
==[[#top|Back To Top]] - [[Python|Category]]== |
Revision as of 13:39, 13 March 2020
Web Scraping
# pip install bs4, requests, pandas  # install them one at a time
import requests
from bs4 import BeautifulSoup
import pandas as pd

# National Weather Service point-forecast pages (lat/lon baked into the URL).
nj = 'https://forecast.weather.gov/MapClick.php?lat=40.89165000000003&lon=-74.04688499999997#.XgvA5xdKhUQ'
alaska = 'https://forecast.weather.gov/MapClick.php?lat=64.0003&lon=-150.0003#.XgvO1BdKhUQ'


def scrape_forecast(url, basename):
    """Scrape the 7-day forecast at *url* and save it as <basename>.csv / .html.

    Each "tombstone" in the forecast strip contributes one row with its
    period name, short description, and temperature.  Prints the resulting
    DataFrame and returns it.
    """
    page = requests.get(url)
    # Fail loudly on HTTP errors instead of silently parsing an error page.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')

    week = soup.find(id='seven-day-forecast-body')
    items = week.find_all(class_='tombstone-container')

    weather_stuff = pd.DataFrame({
        'period': [item.find(class_='period-name').get_text() for item in items],
        'short_description': [item.find(class_='short-desc').get_text() for item in items],
        'temperatures': [item.find(class_='temp').get_text() for item in items],
    })

    print(weather_stuff)
    weather_stuff.to_csv(basename + '.csv')
    weather_stuff.to_html(basename + '.html')
    return weather_stuff


if __name__ == '__main__':
    # Original script scraped the Alaska page; pass `nj` to scrape New Jersey.
    scrape_forecast(alaska, 'alaska')
# pip install bs4, requests, pandas, lxml  # install them one at a time
import csv

import requests
from bs4 import BeautifulSoup

# NOTE: the original script called reload(sys); sys.setdefaultencoding('utf8'),
# a Python 2-only hack that raises NameError on Python 3.  Opening the CSV with
# an explicit encoding (below) is the correct replacement.

source = requests.get('http://coreyms.com').text
soup = BeautifulSoup(source, 'lxml')

# newline='' is required by the csv module to avoid blank rows on Windows;
# the `with` block guarantees the file is closed even if parsing raises.
with open('web_scrape.csv', 'w', newline='', encoding='utf-8') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(['headline', 'summary', 'vidsource'])

    for article in soup.find_all('article'):
        headline = article.h2.a.text
        summary = article.find('div', class_='entry-content').p.text

        # An article with no embedded video has no <iframe>: find() returns
        # None and the ['src'] subscript raises TypeError.
        try:
            vidsource = article.find('iframe')['src']
        except TypeError:
            vidsource = '* No video'

        # Parsing out part of a string, e.g. from
        # http://youtube.com/embed/12345-7o?version=3&rel=1&fs
        #   vid_id = vidsource.split('/')[4]      # 4th split is the YouTube id
        #   vid_id = vid_id.split('?')[0]
        # Put the link together ("f" means formatted; prefix is constant):
        #   yt_link = f'https://youtube.com/watch?v={vid_id}'

        # print(article.prettify())
        print(headline)
        print(summary)
        print(vidsource)
        print('--------------------------')
        csv_writer.writerow([headline, summary, vidsource])