python - Beautiful Soup Strip html caption tags, th class tags and retrieve data not in list -
so have created web scraper goes cfbstats.com/2014/player/index.html , retrieves college football teams , links of football teams. there goes each link , takes roster , players link. goes each players link , takes stats.
i having problem taking players stats. when call header of each table printed output [tackle] , when call first row of table [g]. rid of tags. have been able not have them past functions. appreciated.
import csv import sys import json import urllib import requests bs4 import beautifulsoup import xlrd import xlwt def getcollegeandurl(): f = open('colleges.csv', 'w') f.write("teams" + "," + "," + "url" + '\n') originalurl = "http://www.cfbstats.com/2014/player/index.html" base = requests.get("http://www.cfbstats.com/2014/player/index.html") base = base.text soup = beautifulsoup(base) # find colleges in div conference mydivs = soup.find_all('div',{'class': 'conference'}) ##g excel document roster g = open('rosters.csv', 'w') g.write("college rosters" + '\n' + '\n' + 'college' + ',' + 'playernumber' + ',' + 'player last name' + ',' +'player first name' + ',' + 'position' + ',' + 'year' + ',' + 'height' + ',' + ' weight' + ',' +'hometown' + ',' + 'state' + ',' + 'last school' + ',' + '\n') # h excel each player stats h = xlwt.workbook() # loop finds writes each college line div in mydivs: urls= div.findall('a') # pull college names , each of links url in urls: college = url.text url = url.attrs['href'] teamurl = originalurl[:23]+url f.write(college[:]+ ',' + ',' + teamurl[:]+'\n') scraperosters(college, teamurl, g, h) ############################################################################ def scraperosters(college, teamurl, g, h): # create excel documents # gets pages of teams roster = requests.get(teamurl) roster = roster.text roster = beautifulsoup(roster) teamname = roster.find_all('h1' , {'id': 'pagetitle'}) teamandplayers = {} table = roster.find_all('table', {'class' : 'team-roster'}) in table: rows = i.find_all('tr') row in rows[1:]: data = [str(i.gettext()) in row('td')] link = row('td')[1]('a') if len(link) > 0: link = str(link[0]['href']) data = [str(link)] + data # unpacking data variables (playerurl, playernumber, playername, playerposition,yearincollege, playerheight, playerweight, playerhometown, lastschool) = data # creating full player url playerurl = teamurl[:23] + playerurl # repacking data data = (college, playernumber, playername, playerposition,yearincollege, playerheight, playerweight, playerhometown, lastschool) g.write(college + ',' + playernumber + ',' + playername + ',' + playerposition + ','+ yearincollege + ',' + playerheight + ',' + playerweight + ',' + playerhometown + ',' + lastschool+ ',' + ',' + playerurl + ',' + '\n') playerstats(data, playerurl, h) ############################################################################ def playerstats(data,playerurl, h): playerurl = requests.get(playerurl) playerurl = playerurl.text playerurl = beautifulsoup(playerurl) tablestats = playerurl.find_all('table', {'class' : 'player-home'}) (college, playernumber, playername, playerposition,yearincollege, playerheight, playerweight, playerhometown, lastschool) = data #print college, playernumber, playername print college, playername, playernumber x in tablestats: caption = x.find_all('caption') rows = x.find_all('tr') ## caption = caption.strip row in rows: headers = x.find_all('th') headers = [str(i.gettext()) in row('tr')] stats = [str(x.gettext()) x in row('td')] print caption, headers, stats ############################################################################ def main(): getcollegeandurl() main()
don't work hard, data already available in parseable form.
Comments
Post a Comment