import pandas as pd
import numpy as np
df = pd.DataFrame({'id': [1000, 2000, 3000, 4000, 5000],
'name': ["Marufuku", "Jack in the Box", "Thai Basil", "Tako Sushi", "McDonald's"],
'cuisine': ["Japanese", "Fast Food", "Thai", "Japanese", "Fast Food"],
'orders': [2241, 1592, 820, 1739, 1039],
'rating': [4.8, 4.1, 3.9, 4.2, 4.5]})
df
df.index # default index, does not always have to be this
df.columns
df.shape
.loc
vs. .iloc
¶df2 = df.set_index('id')
df2
df
df.loc[1000] # errors because 1000 is not in the index
df2.loc[1000]
type(df2.loc[1000]) # we'll come back to what a Series is later
df.iloc[3]
df2.iloc[3] # this is the same as the above because both dataframes have the same restaurant in the row at index 3
pd.Series
¶s = df['name']
s
type(s) # each column is a Series
s.index # a Series has an index
s.shape # a Series also has a shape
s.iloc[2:4]
s.loc[2:4]
s.columns # a Series is NOT a DataFrame!
df
df['cuisine'] == "Japanese" # returns what is called a "boolean array"
df[ df['cuisine'] == "Japanese" ]
df.loc[ "cuisine" == "Japanese" ] # common mistake
df[ (df['cuisine'] == "Japanese") & (df['rating'] >= 4.5) ] # can use more than 1 condition
df[ df['cuisine'] == "Japanese" & df['rating'] >= 4.5 ] # parantheses matter!
df
df.groupby('cuisine') # this creates an intermediate object which has the groups within it
df.groupby('cuisine').agg(np.mean) # notice that np.mean is used on ALL numerical columns
df.groupby('cuisine')['rating'].agg(np.mean) # np.mean is only used on the rating column
df.groupby('cuisine')[['rating']].agg(np.mean) # if you want it as a df instead
YOUR TURN: Find the lowest rating within each cuisine.
df.groupby('cuisine')[['rating']].agg(np.min)
Find the restaurant with the lowest rating within each cuisine
# Implementation 1: Lambda Function
df.sort_values("rating").groupby('cuisine')['name'].agg(lambda group: group.iloc[0])
# Implementation 2: .first() aggregator
df.sort_values("rating").groupby('cuisine')['name'].first()
left = pd.DataFrame({
'id':[1,2,3,4,5],
'Name': ['Alex', 'Amy', 'Allen', 'Alice', 'Ayoung'],
'shirt_color':['blue','blue','red','blue','green'],
'subject_id':['sub1','sub2','sub4','sub6','sub5']})
right = pd.DataFrame(
{'id':[1,2,3,4,5],
'Name': ['Billy', 'Brian', 'Bran', 'Bryce', 'Betty'],
'shirt_color':['red','blue','red','red','blue'],
'subject_id':['sub2','sub4','sub3','sub6','sub5']})
left
right
pd.merge(left, right, on='id')
pd.merge(left, right, on=['id', 'shirt_color'])
pd.merge(left, right, on='shirt_color')
pd.merge(left, right, on='shirt_color', how='inner')
pd.merge(left, right, on='subject_id', how='left')
pd.merge(left, right, on='shirt_color', how='left')
pd.merge(left, right, on='shirt_color', how='right')
pd.merge(left, right, on='Name', how='outer')
Any pivot_table
operation has a corresponding groupby
operation that gives the same output, but the pivot_table
displays the output in a slightly different format. Specifically, pivot_table
places one (or more) of the keys you group by as columns in the output DataFrame, whereas groupby
places all of the keys you group by in the index of the output DataFrame. In some cases, pivot_table
and groupby
can give the exact same output.
data = pd.DataFrame({"A": ["foo", "foo", "foo", "foo", "foo",
"bar", "bar", "bar", "bar"],
"B": ["one", "one", "one", "two", "two",
"one", "one", "two", "two"],
"C": ["small", "large", "large", "small",
"small", "large", "small", "small",
"large"],
"D": [1, 2, 2, 3, 3, 4, 5, 6, 7],
"E": [2, 4, 5, 5, 6, 6, 8, 9, 9]})
data
pd.pivot_table(data, values='D', index=['A', 'B'],
columns=['C'], aggfunc=np.sum, fill_value=0)
data.groupby(['A', 'B', 'C'])[['D']].agg(np.sum)
pd.pivot_table(data, values=['D', 'E'], index=['A', 'C'],
aggfunc={'D': np.mean,
'E': np.mean})
data.groupby(["A", "C"])[["D", "E"]].agg(np.mean)
pd.pivot_table(data, values=['D', 'E'], index=['A', 'C'],
aggfunc={'D': np.mean,
'E': [min, max, np.mean]})
data.groupby(["A", "C"])[["D", "E"]].agg({'D': np.mean,
'E': [min, max, np.mean]})