python - Beautiful Soup Strip html caption tags, th class tags and retrieve data not in list -


So I have created a web scraper that goes to cfbstats.com/2014/player/index.html and retrieves all the college football teams and the links of the football teams. From there it goes to each link and takes the roster and the players' links. Finally, it goes to each player's link and takes that player's stats.

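I am currently having a problem taking the players' stats. When I call the header of each table, the printed output is [tackle], and when I call the first row of the table I get [g]. I would like to get rid of those tags. I have been able to avoid them in my past functions. Any help would be appreciated.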

import csv
import sys
import json
import urllib

import requests
from bs4 import BeautifulSoup
import xlrd
import xlwt


def getcollegeandurl():
    """Scrape the cfbstats.com 2014 player index and walk every team roster.

    Writes one line per college (name and roster URL) to ``colleges.csv``,
    writes the roster header to ``rosters.csv``, and calls ``scraperosters``
    for every link found inside a ``div`` of class ``conference``.
    """
    originalurl = "http://www.cfbstats.com/2014/player/index.html"
    roster_columns = [
        'college', 'playernumber', 'player last name', 'player first name',
        'position', 'year', 'height', ' weight', 'hometown', 'state',
        'last school',
    ]

    # f lists the colleges, g holds the rosters; ``with`` closes both files
    # even when a request or a parse step raises part-way through the crawl.
    with open('colleges.csv', 'w') as f, open('rosters.csv', 'w') as g:
        f.write("teams" + "," + "," + "url" + '\n')

        base = requests.get(originalurl).text
        soup = BeautifulSoup(base)

        # find colleges in div conference
        mydivs = soup.find_all('div', {'class': 'conference'})

        g.write("college rosters" + '\n' + '\n'
                + ','.join(roster_columns) + ',' + '\n')

        # h is the excel workbook handed down for each player's stats
        h = xlwt.Workbook()

        # loop finds and writes each college line
        for div in mydivs:
            # pull college names and each of their links
            for anchor in div.find_all('a'):
                college = anchor.text
                # the first 23 characters of originalurl are the site root
                # ("http://www.cfbstats.com"); the href is appended to it
                teamurl = originalurl[:23] + anchor.attrs['href']
                f.write(college + ',' + ',' + teamurl + '\n')
                scraperosters(college, teamurl, g, h)


def scraperosters(college, teamurl, g, h):
    """Fetch one team roster page and record every player that has a link.

    Args:
        college: college name, written as the first column of each row.
        teamurl: absolute URL of the team's roster page.
        g: open, writable file object that receives one line per player.
        h: workbook object, passed through unchanged to ``playerstats``.
    """
    roster = BeautifulSoup(requests.get(teamurl).text)

    for table in roster.find_all('table', {'class': 'team-roster'}):
        # rows[0] is skipped, as in the original (presumably the header row)
        for row in table.find_all('tr')[1:]:
            cells = row('td')
            if len(cells) < 2:
                # the player link is looked up in the second cell
                continue

            link = cells[1]('a')
            if not link:
                continue

            data = [str(cell.get_text()) for cell in cells]

            # unpacking data into variables
            (playernumber, playername, playerposition, yearincollege,
             playerheight, playerweight, playerhometown, lastschool) = data

            # creating full player url
            playerurl = teamurl[:23] + str(link[0]['href'])

            # repacking data
            data = (college, playernumber, playername, playerposition,
                    yearincollege, playerheight, playerweight,
                    playerhometown, lastschool)

            # NOTE(review): fields are joined without quoting, so a comma
            # inside a name or hometown spills into the next column --
            # presumably intended, given the split "last name"/"first name"
            # and "hometown"/"state" header columns; confirm.
            g.write(','.join(data) + ',' + ',' + playerurl + ',' + '\n')

            playerstats(data, playerurl, h)


def playerstats(data, playerurl, h):
    """Fetch one player page and print the text of each stats table row.

    For every ``table`` of class ``player-home`` the caption text, the text
    of the row's ``th`` cells and the text of its ``td`` cells are printed
    with the HTML tags stripped.

    Args:
        data: 9-tuple (college, number, name, position, year, height,
            weight, hometown, last school) as built by ``scraperosters``.
        playerurl: absolute URL of the player's page.
        h: workbook object; accepted for the caller's sake, not used here.
    """
    page = BeautifulSoup(requests.get(playerurl).text)
    tablestats = page.find_all('table', {'class': 'player-home'})

    (college, playernumber, playername, playerposition, yearincollege,
     playerheight, playerweight, playerhometown, lastschool) = data

    print("%s %s %s" % (college, playername, playernumber))

    for table in tablestats:
        # get_text() returns only the text, so no <caption> markup is printed
        caption_tag = table.find('caption')
        if caption_tag is not None:
            caption = str(caption_tag.get_text(strip=True))
        else:
            caption = ''

        for row in table.find_all('tr'):
            # header cells are the <th> children of the row; looking for
            # 'tr' inside a row finds nothing
            headers = [str(cell.get_text()) for cell in row('th')]
            stats = [str(cell.get_text()) for cell in row('td')]

            print("%s %s %s" % (caption, headers, stats))


def main():
    """Run the full scrape."""
    getcollegeandurl()


if __name__ == "__main__":
    main()

Don't work so hard; the data is already available in a parseable form.


Comments

Popular posts from this blog

shopping cart - Page redirect not working PHP -

php - How to modify a menu to show sub-menus -

python - Installing PyDev in eclipse is failed -