
import math
import re

import matplotlib.pyplot as plt
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Page to scrape; going by the URL path, this is ESPN's NBA player offensive
# stats table sorted by avgPoints, descending.
URL = "https://www.espn.com/nba/stats/player/_/table/offensive/sort/avgPoints/dir/desc"

# Let pandas parse the tables on the same page directly (kept for comparison
# with the manual scrape below).
test = pd.read_html(URL)
# Webscraping NBA data, requests.get will fetch a specific url.
# The timeout stops the script from hanging forever on an unresponsive server,
# and raise_for_status() fails fast on an HTTP error instead of letting an
# error page be parsed as if it were the stats table.
page = requests.get(URL, timeout=30)
page.raise_for_status()
# BeautifulSoup is a good python library for webscraping
soup = BeautifulSoup(page.text, "html.parser")
all_tr = soup.find_all('tr')  # find all table row elements
# The bare expressions below only display a value in an interactive session or
# notebook; when run as a script they are evaluated and discarded.
all_tr
type(all_tr)  # class of a python object
len(all_tr)  # length
# Attribute filter for the table rows tagged data-idx "49".
rowAttrs = {'class': "Table__TR Table__TR--sm Table__even", 'data-idx': "49"}
soup.find_all('tr', attrs=rowAttrs)  # Find all table row elements with specific attributes

playerLine = soup.find_all('tr', attrs=rowAttrs)
type(playerLine)  # class of a python object
len(playerLine)  # length
# The first matched row is searched for a link, whose text is printed as the player.
player = playerLine[0].find('a')
playerName = player.get_text()
print(playerName)

# The second matched row is searched for table data cells; print each cell's text.
stats = playerLine[1].find_all('td')  # find all table data elements
for cell in stats:
  cellText = cell.get_text()
  print(cellText)

# Sub-header rows are used as the source of the column variable names.
header = soup.find_all('tr', attrs={'class': "Table__sub-header Table__TR Table__even"})
header
# Links inside the second sub-header row; their text becomes the stat names.
statLocs = header[1].find_all('a')
statNames = [link.get_text() for link in statLocs]
for statName in statNames:
  print(statName)

statNames
type(statNames)
len(statNames)

# First attempt: this nests the whole statNames list as a single fourth element.
finalHeader = ['NAME', 'POS', 'GP', statNames]
finalHeader  # good? NO!
# Second attempt: unpack statNames so every stat name is its own entry.
finalHeader = ['NAME', 'POS', 'GP', *statNames]
finalHeader  # good! :)
# Empty DataFrame carrying only the column labels; rows are added afterwards.
NBAdat = pd.DataFrame(columns=finalHeader)
type(NBAdat)
NBAdat
NBAdat.shape

# Fill NBAdat with one row per data-idx value 0..49.
# For each index the code relies on find_all returning (at least) two rows:
# [0] holds the link with the player name, [1] holds the <td> stat cells.
# The loop variable is `idx` rather than `id` so the builtin id() is not shadowed.
for idx in range(50):
  playerLine = soup.find_all('tr', attrs={'class': "Table__TR Table__TR--sm Table__even", 'data-idx': str(idx)})
  if len(playerLine) < 2:
    # Same exception type as the plain index lookup would raise, but the
    # message names the data-idx that was missing.
    raise IndexError(f"expected 2 table rows for data-idx {idx}, found {len(playerLine)}")
  player = playerLine[0].find('a').get_text()
  statsLine = playerLine[1].find_all('td')
  stats = [element.get_text() for element in statsLine]
  # Row label is the integer idx; values are the name followed by the cell texts.
  NBAdat.loc[idx] = [player] + stats

# Ways of looking at the DataFrame. The bare expressions only display a value
# in an interactive session or notebook; as a script they are evaluated and discarded.
NBAdat.shape  # dimensions
len(NBAdat)  # number of rows
NBAdat.head(5)  # head
NBAdat['POS']  # view a column by title
NBAdat.loc[:, 'POS']  # same as above
# The next line is kept as a commented-out counter-example: .loc selects by
# label and no column is labelled 1, so running it raises KeyError and would
# abort the rest of the script.
# NBAdat.loc[:,1] # error!
NBAdat.iloc[:, 1]  # position-based: second column
NBAdat.iloc[1, :]  # view a row
# Also a counter-example: plain [] looks up a column label, and there is no
# column labelled 1, so this raises KeyError too.
# NBAdat[1] # bad
NBAdat.loc[1]  # good: row whose index label is 1
NBAdat.iloc[0:3, 0:3]  # submatrix view


# summarize the data in fun ways
# Convert every stat column to a numeric dtype, one column at a time.
numericStats = NBAdat[statNames].apply(pd.to_numeric)
NBAdat[statNames] = numericStats
# Group the players by the POS column and look at a few per-group summaries.
posGroup = NBAdat.groupby(by='POS')
posGroup.get_group('PG')  # rows of the 'PG' group
posGroup.groups.keys()  # the group labels
posGroup['NAME'].count()  # number of names per group
posGroup['3PA'].mean()  # mean of the 3PA column per group

# Scatter plot with the 3P% column on x and the 3PA column on y, written to plot.png.
fig = plt.figure()
xValues = NBAdat['3P%']
yValues = NBAdat['3PA']
plt.scatter(xValues, yValues)
fig.savefig('plot.png')



