import pandas as pd
import numpy as np
df = pd.DataFrame({'id': [1000, 2000, 3000, 4000, 5000],
                   'name': ["Marufuku", "Jack in the Box", "Thai Basil", "Tako Sushi", "McDonald's"],
                   'cuisine': ["Japanese", "Fast Food", "Thai", "Japanese", "Fast Food"],
                   'orders': [2241, 1592, 820, 1739, 1039],
                   'rating': [4.8, 4.1, 3.9, 4.2, 4.5]})
df
df.index # the default index; it does not always have to be this
df.columns
df.shape
.loc vs. .iloc
df2 = df.set_index('id')
df2
df
df.loc[1000] # errors because 1000 is not in the index
df2.loc[1000]
type(df2.loc[1000]) # we'll come back to what a Series is later
df.iloc[3]
df2.iloc[3] # same result as above because both DataFrames have the same restaurant at integer position 3
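To make the label-vs-position distinction concrete, here is a minimal sketch using df2 (whose index is the id column): .loc looks a row up by its index label, while .iloc looks it up by its integer position.
# a quick sketch: .loc uses index labels, .iloc uses integer positions
df2.loc[3000]  # the row whose index label (id) is 3000
df2.iloc[2]    # the row at integer position 2, which is the same restaurant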
pd.Series
s = df['name']
s
type(s) # each column is a Series
s.index # a Series has an index
s.shape # a Series also has a shape
s.iloc[2:4] # position-based slicing excludes the endpoint: positions 2 and 3
s.loc[2:4] # label-based slicing includes the endpoint: labels 2, 3, and 4
s.columns # errors with an AttributeError because a Series is NOT a DataFrame!
df
df['cuisine'] == "Japanese" # returns what is called a "boolean array"
df[ df['cuisine'] == "Japanese" ]
df.loc[ "cuisine" == "Japanese" ] # common mistake: the string comparison is just False, so this raises an error
df[ (df['cuisine'] == "Japanese") & (df['rating'] >= 4.5) ] # can use more than 1 condition
df[ df['cuisine'] == "Japanese" & df['rating'] >= 4.5 ] # parentheses matter! & binds more tightly than ==, so this errors
df
df.groupby('cuisine') # this creates an intermediate DataFrameGroupBy object that holds the groups
df.groupby('cuisine').agg(np.mean) # notice that np.mean is used on ALL numerical columns
df.groupby('cuisine')['rating'].agg(np.mean) # np.mean is only used on the rating column
df.groupby('cuisine')[['rating']].agg(np.mean) # if you want it as a df instead
YOUR TURN: Find the lowest rating within each cuisine.
df.groupby('cuisine')[['rating']].agg(np.min)
Find the restaurant with the lowest rating within each cuisine
# Implementation 1: Lambda Function
df.sort_values("rating").groupby('cuisine')['name'].agg(lambda group: group.iloc[0])
# Implementation 2: .first() aggregator
df.sort_values("rating").groupby('cuisine')['name'].first()
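A third way to get the same answer, sketched here with the idxmin aggregator (which returns the index label of each group's minimum), is to look those rows up directly:
# Implementation 3 (sketch): idxmin gives the row label of the lowest-rated restaurant per cuisine
df.loc[df.groupby('cuisine')['rating'].idxmin(), ['cuisine', 'name', 'rating']]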
left = pd.DataFrame({
    'id': [1, 2, 3, 4, 5],
    'Name': ['Alex', 'Amy', 'Allen', 'Alice', 'Ayoung'],
    'shirt_color': ['blue', 'blue', 'red', 'blue', 'green'],
    'subject_id': ['sub1', 'sub2', 'sub4', 'sub6', 'sub5']})
right = pd.DataFrame({
    'id': [1, 2, 3, 4, 5],
    'Name': ['Billy', 'Brian', 'Bran', 'Bryce', 'Betty'],
    'shirt_color': ['red', 'blue', 'red', 'red', 'blue'],
    'subject_id': ['sub2', 'sub4', 'sub3', 'sub6', 'sub5']})
left
right
pd.merge(left, right, on='id')
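Both tables share the Name, shirt_color, and subject_id column names, so pandas disambiguates the overlapping columns with _x/_y suffixes. As a small sketch, the suffixes argument of pd.merge lets you pick clearer labels:
pd.merge(left, right, on='id', suffixes=('_left', '_right'))  # e.g. Name_left / Name_right instead of Name_x / Name_y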
pd.merge(left, right, on=['id', 'shirt_color'])
pd.merge(left, right, on='shirt_color')
pd.merge(left, right, on='shirt_color', how='inner')
pd.merge(left, right, on='subject_id', how='left')
pd.merge(left, right, on='shirt_color', how='left')
pd.merge(left, right, on='shirt_color', how='right')
pd.merge(left, right, on='Name', how='outer')
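With outer joins it is easy to lose track of which table a row came from. A small sketch using merge's indicator option adds a _merge column marking each row as left_only, right_only, or both:
pd.merge(left, right, on='Name', how='outer', indicator=True)  # the _merge column shows where each row came from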
Any pivot_table operation has a corresponding groupby operation that computes the same values, but pivot_table displays the output in a slightly different format: pivot_table places one (or more) of the grouping keys as columns in the output DataFrame, whereas groupby places all of the grouping keys in the index. In some cases, pivot_table and groupby give exactly the same output.
data = pd.DataFrame({"A": ["foo", "foo", "foo", "foo", "foo",
                           "bar", "bar", "bar", "bar"],
                     "B": ["one", "one", "one", "two", "two",
                           "one", "one", "two", "two"],
                     "C": ["small", "large", "large", "small",
                           "small", "large", "small", "small",
                           "large"],
                     "D": [1, 2, 2, 3, 3, 4, 5, 6, 7],
                     "E": [2, 4, 5, 5, 6, 6, 8, 9, 9]})
data
pd.pivot_table(data, values='D', index=['A', 'B'],
               columns=['C'], aggfunc=np.sum, fill_value=0)
data.groupby(['A', 'B', 'C'])[['D']].agg(np.sum)
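To make the correspondence explicit, here is a sketch: unstacking the C level of the groupby result moves that grouping key into the columns, reproducing the pivot_table layout above.
data.groupby(['A', 'B', 'C'])['D'].agg(np.sum).unstack('C', fill_value=0)  # same table as the pivot_table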
pd.pivot_table(data, values=['D', 'E'], index=['A', 'C'],
               aggfunc={'D': np.mean,
                        'E': np.mean})
data.groupby(["A", "C"])[["D", "E"]].agg(np.mean)
pd.pivot_table(data, values=['D', 'E'], index=['A', 'C'],
               aggfunc={'D': np.mean,
                        'E': [min, max, np.mean]})
data.groupby(["A", "C"])[["D", "E"]].agg({'D': np.mean,
                                          'E': [min, max, np.mean]})