In [1]:
import pandas as pd
import numpy as np
In [2]:
# Toy restaurant dataset: one row per restaurant.
df = pd.DataFrame(
    [(1000, "Marufuku",        "Japanese",  2241, 4.8),
     (2000, "Jack in the Box", "Fast Food", 1592, 4.1),
     (3000, "Thai Basil",      "Thai",       820, 3.9),
     (4000, "Tako Sushi",      "Japanese",  1739, 4.2),
     (5000, "McDonald's",      "Fast Food", 1039, 4.5)],
    columns=['id', 'name', 'cuisine', 'orders', 'rating'],
)
In [3]:
df
Out[3]:
id name cuisine orders rating
0 1000 Marufuku Japanese 2241 4.8
1 2000 Jack in the Box Fast Food 1592 4.1
2 3000 Thai Basil Thai 820 3.9
3 4000 Tako Sushi Japanese 1739 4.2
4 5000 McDonald's Fast Food 1039 4.5

Anatomy of a DataFrame

In [4]:
df.index  # the default RangeIndex; a DataFrame's index does not always have to be this
Out[4]:
RangeIndex(start=0, stop=5, step=1)
In [5]:
df.columns  # the column labels are also an Index object
Out[5]:
Index(['id', 'name', 'cuisine', 'orders', 'rating'], dtype='object')
In [6]:
df.shape  # (number of rows, number of columns)
Out[6]:
(5, 5)

Referring to Certain Parts of a DataFrame

Accessing particular rows: .loc vs. .iloc

In [7]:
df2 = df.set_index('id')  # returns a NEW frame indexed by id; df itself is unchanged
In [8]:
df2  # same data, but the id values are now the row labels
Out[8]:
name cuisine orders rating
id
1000 Marufuku Japanese 2241 4.8
2000 Jack in the Box Fast Food 1592 4.1
3000 Thai Basil Thai 820 3.9
4000 Tako Sushi Japanese 1739 4.2
5000 McDonald's Fast Food 1039 4.5
In [9]:
df  # the original frame is unchanged — set_index returned a new frame
Out[9]:
id name cuisine orders rating
0 1000 Marufuku Japanese 2241 4.8
1 2000 Jack in the Box Fast Food 1592 4.1
2 3000 Thai Basil Thai 820 3.9
3 4000 Tako Sushi Japanese 1739 4.2
4 5000 McDonald's Fast Food 1039 4.5
In [10]:
df.loc[1000]  # errors because 1000 is not in the index
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
~/anaconda3/lib/python3.7/site-packages/pandas/core/indexes/range.py in get_loc(self, key, method, tolerance)
    350                 try:
--> 351                     return self._range.index(new_key)
    352                 except ValueError as err:

ValueError: 1000 is not in range

The above exception was the direct cause of the following exception:

KeyError                                  Traceback (most recent call last)
<ipython-input-10-cc82ac2fae9d> in <module>
----> 1 df.loc[1000]  # errors because 1000 is not in the index

~/anaconda3/lib/python3.7/site-packages/pandas/core/indexing.py in __getitem__(self, key)
    877 
    878             maybe_callable = com.apply_if_callable(key, self.obj)
--> 879             return self._getitem_axis(maybe_callable, axis=axis)
    880 
    881     def _is_scalar_access(self, key: Tuple):

~/anaconda3/lib/python3.7/site-packages/pandas/core/indexing.py in _getitem_axis(self, key, axis)
   1108         # fall thru to straight lookup
   1109         self._validate_key(key, axis)
-> 1110         return self._get_label(key, axis=axis)
   1111 
   1112     def _get_slice_axis(self, slice_obj: slice, axis: int):

~/anaconda3/lib/python3.7/site-packages/pandas/core/indexing.py in _get_label(self, label, axis)
   1057     def _get_label(self, label, axis: int):
   1058         # GH#5667 this will fail if the label is not present in the axis.
-> 1059         return self.obj.xs(label, axis=axis)
   1060 
   1061     def _handle_lowerdim_multi_index_axis0(self, tup: Tuple):

~/anaconda3/lib/python3.7/site-packages/pandas/core/generic.py in xs(self, key, axis, level, drop_level)
   3480             loc, new_index = self.index.get_loc_level(key, drop_level=drop_level)
   3481         else:
-> 3482             loc = self.index.get_loc(key)
   3483 
   3484             if isinstance(loc, np.ndarray):

~/anaconda3/lib/python3.7/site-packages/pandas/core/indexes/range.py in get_loc(self, key, method, tolerance)
    351                     return self._range.index(new_key)
    352                 except ValueError as err:
--> 353                     raise KeyError(key) from err
    354             raise KeyError(key)
    355         return super().get_loc(key, method=method, tolerance=tolerance)

KeyError: 1000
In [11]:
df2.loc[1000]  # works here because 1000 IS a label in df2's index; the row comes back as a Series
Out[11]:
name       Marufuku
cuisine    Japanese
orders         2241
rating          4.8
Name: 1000, dtype: object
In [12]:
type(df2.loc[1000])  # we'll come back to what a Series is later
Out[12]:
pandas.core.series.Series
In [13]:
df.iloc[3]  # .iloc is position-based: the fourth row, regardless of index labels
Out[13]:
id               4000
name       Tako Sushi
cuisine      Japanese
orders           1739
rating            4.2
Name: 3, dtype: object
In [14]:
df2.iloc[3]  # this is the same as the above because both dataframes have the same restaurant in the row at index 3
Out[14]:
name       Tako Sushi
cuisine      Japanese
orders           1739
rating            4.2
Name: 4000, dtype: object

Accessing particular columns: pd.Series

In [15]:
# Selecting a single column with [] yields a pd.Series.
s = df['name']
s
Out[15]:
0           Marufuku
1    Jack in the Box
2         Thai Basil
3         Tako Sushi
4         McDonald's
Name: name, dtype: object
In [16]:
type(s)  # each column is a Series
Out[16]:
pandas.core.series.Series
In [17]:
s.index  # a Series has an index
Out[17]:
RangeIndex(start=0, stop=5, step=1)
In [18]:
s.shape  # a Series also has a shape
Out[18]:
(5,)
In [19]:
s.iloc[2:4]  # position-based slicing: end point EXCLUDED (positions 2 and 3)
Out[19]:
2    Thai Basil
3    Tako Sushi
Name: name, dtype: object
In [20]:
s.loc[2:4]  # label-based slicing: end point INCLUDED (labels 2, 3 AND 4)
Out[20]:
2    Thai Basil
3    Tako Sushi
4    McDonald's
Name: name, dtype: object
In [21]:
s.columns  # a Series is NOT a DataFrame!
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-21-27a1b26bb635> in <module>
----> 1 s.columns  # a Series is NOT a DataFrame!

~/anaconda3/lib/python3.7/site-packages/pandas/core/generic.py in __getattr__(self, name)
   5128             if self._info_axis._can_hold_identifiers_and_holds_name(name):
   5129                 return self[name]
-> 5130             return object.__getattribute__(self, name)
   5131 
   5132     def __setattr__(self, name: str, value) -> None:

AttributeError: 'Series' object has no attribute 'columns'

Accessing particular rows: boolean array indexing

In [22]:
df
Out[22]:
id name cuisine orders rating
0 1000 Marufuku Japanese 2241 4.8
1 2000 Jack in the Box Fast Food 1592 4.1
2 3000 Thai Basil Thai 820 3.9
3 4000 Tako Sushi Japanese 1739 4.2
4 5000 McDonald's Fast Food 1039 4.5
In [23]:
df['cuisine'] == "Japanese"  # returns what is called a "boolean array"
Out[23]:
0     True
1    False
2    False
3     True
4    False
Name: cuisine, dtype: bool
In [24]:
df[ df['cuisine'] == "Japanese" ]  # keeps only the rows where the boolean array is True
Out[24]:
id name cuisine orders rating
0 1000 Marufuku Japanese 2241 4.8
3 4000 Tako Sushi Japanese 1739 4.2
In [25]:
df.loc[ "cuisine" == "Japanese" ]  # common mistake
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-25-21e239a7d4d1> in <module>
----> 1 df.loc[ "cuisine" == "Japanese" ]  # common mistake

~/anaconda3/lib/python3.7/site-packages/pandas/core/indexing.py in __getitem__(self, key)
    877 
    878             maybe_callable = com.apply_if_callable(key, self.obj)
--> 879             return self._getitem_axis(maybe_callable, axis=axis)
    880 
    881     def _is_scalar_access(self, key: Tuple):

~/anaconda3/lib/python3.7/site-packages/pandas/core/indexing.py in _getitem_axis(self, key, axis)
   1108         # fall thru to straight lookup
   1109         self._validate_key(key, axis)
-> 1110         return self._get_label(key, axis=axis)
   1111 
   1112     def _get_slice_axis(self, slice_obj: slice, axis: int):

~/anaconda3/lib/python3.7/site-packages/pandas/core/indexing.py in _get_label(self, label, axis)
   1057     def _get_label(self, label, axis: int):
   1058         # GH#5667 this will fail if the label is not present in the axis.
-> 1059         return self.obj.xs(label, axis=axis)
   1060 
   1061     def _handle_lowerdim_multi_index_axis0(self, tup: Tuple):

~/anaconda3/lib/python3.7/site-packages/pandas/core/generic.py in xs(self, key, axis, level, drop_level)
   3480             loc, new_index = self.index.get_loc_level(key, drop_level=drop_level)
   3481         else:
-> 3482             loc = self.index.get_loc(key)
   3483 
   3484             if isinstance(loc, np.ndarray):

~/anaconda3/lib/python3.7/site-packages/pandas/core/indexes/range.py in get_loc(self, key, method, tolerance)
    352                 except ValueError as err:
    353                     raise KeyError(key) from err
--> 354             raise KeyError(key)
    355         return super().get_loc(key, method=method, tolerance=tolerance)
    356 

KeyError: False
In [26]:
df[ (df['cuisine'] == "Japanese") & (df['rating'] >= 4.5) ]  # can use more than 1 condition
Out[26]:
id name cuisine orders rating
0 1000 Marufuku Japanese 2241 4.8
In [27]:
# Without parentheses, `&` binds tighter than the comparisons, so
# `"Japanese" & df['rating']` is evaluated first and raises a TypeError.
df[ df['cuisine'] == "Japanese" & df['rating'] >= 4.5 ]  # parentheses matter!
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
~/anaconda3/lib/python3.7/site-packages/pandas/core/ops/array_ops.py in na_logical_op(x, y, op)
    260         #  (xint or xbool) and (yint or bool)
--> 261         result = op(x, y)
    262     except TypeError:

~/anaconda3/lib/python3.7/site-packages/pandas/core/ops/roperator.py in rand_(left, right)
     51 def rand_(left, right):
---> 52     return operator.and_(right, left)
     53 

TypeError: ufunc 'bitwise_and' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

During handling of the above exception, another exception occurred:

ValueError                                Traceback (most recent call last)
~/anaconda3/lib/python3.7/site-packages/pandas/core/ops/array_ops.py in na_logical_op(x, y, op)
    274             try:
--> 275                 result = libops.scalar_binop(x, y, op)
    276             except (

pandas/_libs/ops.pyx in pandas._libs.ops.scalar_binop()

ValueError: Buffer dtype mismatch, expected 'Python object' but got 'double'

The above exception was the direct cause of the following exception:

TypeError                                 Traceback (most recent call last)
<ipython-input-27-6e45bcf433c0> in <module>
----> 1 df[ df['cuisine'] == "Japanese" & df['rating'] >= 4.5 ]  # parentheses matter!

~/anaconda3/lib/python3.7/site-packages/pandas/core/ops/common.py in new_method(self, other)
     63         other = item_from_zerodim(other)
     64 
---> 65         return method(self, other)
     66 
     67     return new_method

~/anaconda3/lib/python3.7/site-packages/pandas/core/ops/__init__.py in wrapper(self, other)
    392         rvalues = extract_array(other, extract_numpy=True)
    393 
--> 394         res_values = logical_op(lvalues, rvalues, op)
    395         return self._construct_result(res_values, name=res_name)
    396 

~/anaconda3/lib/python3.7/site-packages/pandas/core/ops/array_ops.py in logical_op(left, right, op)
    349         filler = fill_int if is_self_int_dtype and is_other_int_dtype else fill_bool
    350 
--> 351         res_values = na_logical_op(lvalues, rvalues, op)
    352         res_values = filler(res_values)  # type: ignore
    353 

~/anaconda3/lib/python3.7/site-packages/pandas/core/ops/array_ops.py in na_logical_op(x, y, op)
    285                     f"Cannot perform '{op.__name__}' with a dtyped [{x.dtype}] array "
    286                     f"and scalar of type [{typ}]"
--> 287                 ) from err
    288 
    289     return result.reshape(x.shape)

TypeError: Cannot perform 'rand_' with a dtyped [float64] array and scalar of type [bool]

Group By Operations in Pandas

In [28]:
df
Out[28]:
id name cuisine orders rating
0 1000 Marufuku Japanese 2241 4.8
1 2000 Jack in the Box Fast Food 1592 4.1
2 3000 Thai Basil Thai 820 3.9
3 4000 Tako Sushi Japanese 1739 4.2
4 5000 McDonald's Fast Food 1039 4.5
In [29]:
df.groupby('cuisine')  # this creates an intermediate object which has the groups within it
Out[29]:
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x11ee3e850>
In [30]:
df.groupby('cuisine').agg(np.mean)  # notice that np.mean is used on ALL numerical columns; NOTE: newer pandas prefers the string alias .agg("mean")
Out[30]:
id orders rating
cuisine
Fast Food 3500.0 1315.5 4.3
Japanese 2500.0 1990.0 4.5
Thai 3000.0 820.0 3.9
In [31]:
df.groupby('cuisine')['rating'].agg(np.mean)  # np.mean is only used on the rating column
Out[31]:
cuisine
Fast Food    4.3
Japanese     4.5
Thai         3.9
Name: rating, dtype: float64
In [32]:
df.groupby('cuisine')[['rating']].agg(np.mean)  # if you want it as a df instead
Out[32]:
rating
cuisine
Fast Food 4.3
Japanese 4.5
Thai 3.9

YOUR TURN: Find the lowest rating within each cuisine.

In [33]:
df.groupby('cuisine')[['rating']].agg(np.min)  # double brackets keep the result as a DataFrame
Out[33]:
rating
cuisine
Fast Food 4.1
Japanese 4.2
Thai 3.9

Find the restaurant with the lowest rating within each cuisine

In [34]:
# Implementation 1: Lambda Function
# Sort by rating first, so that .iloc[0] inside each group is the lowest-rated row.
df.sort_values("rating").groupby('cuisine')['name'].agg(lambda group: group.iloc[0])
Out[34]:
cuisine
Fast Food    Jack in the Box
Japanese          Tako Sushi
Thai              Thai Basil
Name: name, dtype: object
In [35]:
# Implementation 2: .first() aggregator
# After sorting by rating, each group's first row is its lowest-rated restaurant.
df.sort_values("rating").groupby('cuisine')['name'].first()
Out[35]:
cuisine
Fast Food    Jack in the Box
Japanese          Tako Sushi
Thai              Thai Basil
Name: name, dtype: object

Joins

In [36]:
# Two small rosters with the same column layout, used to demonstrate joins.
left = pd.DataFrame(
   [(1, 'Alex',   'blue',  'sub1'),
    (2, 'Amy',    'blue',  'sub2'),
    (3, 'Allen',  'red',   'sub4'),
    (4, 'Alice',  'blue',  'sub6'),
    (5, 'Ayoung', 'green', 'sub5')],
   columns=['id', 'Name', 'shirt_color', 'subject_id'])
right = pd.DataFrame(
   [(1, 'Billy', 'red',  'sub2'),
    (2, 'Brian', 'blue', 'sub4'),
    (3, 'Bran',  'red',  'sub3'),
    (4, 'Bryce', 'red',  'sub6'),
    (5, 'Betty', 'blue', 'sub5')],
   columns=['id', 'Name', 'shirt_color', 'subject_id'])
In [37]:
left
Out[37]:
id Name shirt_color subject_id
0 1 Alex blue sub1
1 2 Amy blue sub2
2 3 Allen red sub4
3 4 Alice blue sub6
4 5 Ayoung green sub5
In [38]:
right
Out[38]:
id Name shirt_color subject_id
0 1 Billy red sub2
1 2 Brian blue sub4
2 3 Bran red sub3
3 4 Bryce red sub6
4 5 Betty blue sub5
In [39]:
pd.merge(left, right, on='id')  # join on id; overlapping column names get _x/_y suffixes
Out[39]:
id Name_x shirt_color_x subject_id_x Name_y shirt_color_y subject_id_y
0 1 Alex blue sub1 Billy red sub2
1 2 Amy blue sub2 Brian blue sub4
2 3 Allen red sub4 Bran red sub3
3 4 Alice blue sub6 Bryce red sub6
4 5 Ayoung green sub5 Betty blue sub5
In [40]:
pd.merge(left, right, on=['id', 'shirt_color'])  # multiple keys: rows must match on BOTH columns
Out[40]:
id Name_x shirt_color subject_id_x Name_y subject_id_y
0 2 Amy blue sub2 Brian sub4
1 3 Allen red sub4 Bran sub3
In [41]:
pd.merge(left, right, on='shirt_color')  # duplicated key values produce every matching (left, right) pair
Out[41]:
id_x Name_x shirt_color subject_id_x id_y Name_y subject_id_y
0 1 Alex blue sub1 2 Brian sub4
1 1 Alex blue sub1 5 Betty sub5
2 2 Amy blue sub2 2 Brian sub4
3 2 Amy blue sub2 5 Betty sub5
4 4 Alice blue sub6 2 Brian sub4
5 4 Alice blue sub6 5 Betty sub5
6 3 Allen red sub4 1 Billy sub2
7 3 Allen red sub4 3 Bran sub3
8 3 Allen red sub4 4 Bryce sub6
In [42]:
pd.merge(left, right, on='shirt_color', how='inner')  # 'inner' is the default, so this matches the previous output
Out[42]:
id_x Name_x shirt_color subject_id_x id_y Name_y subject_id_y
0 1 Alex blue sub1 2 Brian sub4
1 1 Alex blue sub1 5 Betty sub5
2 2 Amy blue sub2 2 Brian sub4
3 2 Amy blue sub2 5 Betty sub5
4 4 Alice blue sub6 2 Brian sub4
5 4 Alice blue sub6 5 Betty sub5
6 3 Allen red sub4 1 Billy sub2
7 3 Allen red sub4 3 Bran sub3
8 3 Allen red sub4 4 Bryce sub6
In [43]:
pd.merge(left, right, on='subject_id', how='left')  # keep every left row; unmatched right columns become NaN
Out[43]:
id_x Name_x shirt_color_x subject_id id_y Name_y shirt_color_y
0 1 Alex blue sub1 NaN NaN NaN
1 2 Amy blue sub2 1.0 Billy red
2 3 Allen red sub4 2.0 Brian blue
3 4 Alice blue sub6 4.0 Bryce red
4 5 Ayoung green sub5 5.0 Betty blue
In [44]:
pd.merge(left, right, on='shirt_color', how='left')  # Ayoung (green) has no match, so the right-side columns are NaN
Out[44]:
id_x Name_x shirt_color subject_id_x id_y Name_y subject_id_y
0 1 Alex blue sub1 2.0 Brian sub4
1 1 Alex blue sub1 5.0 Betty sub5
2 2 Amy blue sub2 2.0 Brian sub4
3 2 Amy blue sub2 5.0 Betty sub5
4 3 Allen red sub4 1.0 Billy sub2
5 3 Allen red sub4 3.0 Bran sub3
6 3 Allen red sub4 4.0 Bryce sub6
7 4 Alice blue sub6 2.0 Brian sub4
8 4 Alice blue sub6 5.0 Betty sub5
9 5 Ayoung green sub5 NaN NaN NaN
In [45]:
pd.merge(left, right, on='shirt_color', how='right')  # keep every right row instead
Out[45]:
id_x Name_x shirt_color subject_id_x id_y Name_y subject_id_y
0 3 Allen red sub4 1 Billy sub2
1 3 Allen red sub4 3 Bran sub3
2 3 Allen red sub4 4 Bryce sub6
3 1 Alex blue sub1 2 Brian sub4
4 2 Amy blue sub2 2 Brian sub4
5 4 Alice blue sub6 2 Brian sub4
6 1 Alex blue sub1 5 Betty sub5
7 2 Amy blue sub2 5 Betty sub5
8 4 Alice blue sub6 5 Betty sub5
In [46]:
pd.merge(left, right, on='Name', how='outer')  # no names overlap, so every row appears once with NaNs on the other side
Out[46]:
id_x Name shirt_color_x subject_id_x id_y shirt_color_y subject_id_y
0 1.0 Alex blue sub1 NaN NaN NaN
1 2.0 Amy blue sub2 NaN NaN NaN
2 3.0 Allen red sub4 NaN NaN NaN
3 4.0 Alice blue sub6 NaN NaN NaN
4 5.0 Ayoung green sub5 NaN NaN NaN
5 NaN Billy NaN NaN 1.0 red sub2
6 NaN Brian NaN NaN 2.0 blue sub4
7 NaN Bran NaN NaN 3.0 red sub3
8 NaN Bryce NaN NaN 4.0 red sub6
9 NaN Betty NaN NaN 5.0 blue sub5

Pivot Tables

Any pivot_table operation has a corresponding groupby operation that gives the same output, but the pivot_table displays the output in a slightly different format. Specifically, pivot_table places one (or more) of the keys you group by as columns in the output DataFrame, whereas groupby places all of the keys you group by in the index of the output DataFrame. In some cases, pivot_table and groupby can give the exact same output.

In [47]:
# Example frame (from the pandas pivot_table docs): two grouping keys (A, B),
# one column to pivot on (C), and two numeric columns to aggregate (D, E).
data = pd.DataFrame(
    [("foo", "one", "small", 1, 2),
     ("foo", "one", "large", 2, 4),
     ("foo", "one", "large", 2, 5),
     ("foo", "two", "small", 3, 5),
     ("foo", "two", "small", 3, 6),
     ("bar", "one", "large", 4, 6),
     ("bar", "one", "small", 5, 8),
     ("bar", "two", "small", 6, 9),
     ("bar", "two", "large", 7, 9)],
    columns=["A", "B", "C", "D", "E"],
)
data
Out[47]:
A B C D E
0 foo one small 1 2
1 foo one large 2 4
2 foo one large 2 5
3 foo two small 3 5
4 foo two small 3 6
5 bar one large 4 6
6 bar one small 5 8
7 bar two small 6 9
8 bar two large 7 9
In [48]:
# C's values become columns; (A, B) pairs with no rows for a C value are filled with 0.
pd.pivot_table(data, values='D', index=['A', 'B'],
                    columns=['C'], aggfunc=np.sum, fill_value=0)
Out[48]:
C large small
A B
bar one 4 5
two 7 6
foo one 4 1
two 0 6
In [49]:
data.groupby(['A', 'B', 'C'])[['D']].agg(np.sum)  # equivalent groupby: C stays in the index instead of becoming columns
Out[49]:
D
A B C
bar one large 4
small 5
two large 7
small 6
foo one large 4
small 1
two small 6
In [50]:
# Per-column aggregation functions via a dict (both happen to be mean here).
pd.pivot_table(data, values=['D', 'E'], index=['A', 'C'],
                    aggfunc={'D': np.mean,
                             'E': np.mean})
Out[50]:
D E
A C
bar large 5.500000 7.500000
small 5.500000 8.500000
foo large 2.000000 4.500000
small 2.333333 4.333333
In [51]:
data.groupby(["A", "C"])[["D", "E"]].agg(np.mean)  # identical output to the pivot_table above
Out[51]:
D E
A C
bar large 5.500000 7.500000
small 5.500000 8.500000
foo large 2.000000 4.500000
small 2.333333 4.333333
In [52]:
# A column can get a LIST of aggregators; the result grows a column MultiIndex.
pd.pivot_table(data, values=['D', 'E'], index=['A', 'C'],
                    aggfunc={'D': np.mean,
                             'E': [min, max, np.mean]})
Out[52]:
D E
mean max mean min
A C
bar large 5.500000 9.0 7.500000 6.0
small 5.500000 9.0 8.500000 8.0
foo large 2.000000 5.0 4.500000 4.0
small 2.333333 6.0 4.333333 2.0
In [53]:
# groupby equivalent of the pivot_table above; only the aggregator column order differs.
data.groupby(["A", "C"])[["D", "E"]].agg({'D': np.mean,
                                          'E': [min, max, np.mean]})
Out[53]:
D E
mean min max mean
A C
bar large 5.500000 6 9 7.500000
small 5.500000 8 9 8.500000
foo large 2.000000 4 5 4.500000
small 2.333333 2 6 4.333333