In [1]:
import pandas as pd
import numpy as np
In [2]:
# Toy restaurant dataset: one row per restaurant.
df = pd.DataFrame(
    [(1000, "Marufuku",        "Japanese",  2241, 4.8),
     (2000, "Jack in the Box", "Fast Food", 1592, 4.1),
     (3000, "Thai Basil",      "Thai",       820, 3.9),
     (4000, "Tako Sushi",      "Japanese",  1739, 4.2),
     (5000, "McDonald's",      "Fast Food", 1039, 4.5)],
    columns=['id', 'name', 'cuisine', 'orders', 'rating'],
)
In [3]:
df
Out[3]:
id name cuisine orders rating
0 1000 Marufuku Japanese 2241 4.8
1 2000 Jack in the Box Fast Food 1592 4.1
2 3000 Thai Basil Thai 820 3.9
3 4000 Tako Sushi Japanese 1739 4.2
4 5000 McDonald's Fast Food 1039 4.5

Anatomy of a DataFrame

In [4]:
df.index  # the default RangeIndex; a DataFrame's index does not always have to be this
Out[4]:
RangeIndex(start=0, stop=5, step=1)
In [5]:
df.columns  # the column labels are also an Index object
Out[5]:
Index(['id', 'name', 'cuisine', 'orders', 'rating'], dtype='object')
In [6]:
df.shape  # (number of rows, number of columns)
Out[6]:
(5, 5)

Referring to Certain Parts of a DataFrame

Accessing particular rows: .loc vs. .iloc

In [7]:
df2 = df.set_index('id')  # returns a NEW frame indexed by id; df itself is unchanged
In [8]:
df2  # same data, but the id values are now the row labels
Out[8]:
name cuisine orders rating
id
1000 Marufuku Japanese 2241 4.8
2000 Jack in the Box Fast Food 1592 4.1
3000 Thai Basil Thai 820 3.9
4000 Tako Sushi Japanese 1739 4.2
5000 McDonald's Fast Food 1039 4.5
In [9]:
df  # the original frame is unchanged — set_index returned a new frame
Out[9]:
id name cuisine orders rating
0 1000 Marufuku Japanese 2241 4.8
1 2000 Jack in the Box Fast Food 1592 4.1
2 3000 Thai Basil Thai 820 3.9
3 4000 Tako Sushi Japanese 1739 4.2
4 5000 McDonald's Fast Food 1039 4.5
In [10]:
df.loc[1000]  # errors because 1000 is not in the index
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
~/anaconda3/lib/python3.7/site-packages/pandas/core/indexes/range.py in get_loc(self, key, method, tolerance)
    350                 try:
--> 351                     return self._range.index(new_key)
    352                 except ValueError as err:

ValueError: 1000 is not in range

The above exception was the direct cause of the following exception:

KeyError                                  Traceback (most recent call last)
<ipython-input-10-cc82ac2fae9d> in <module>
----> 1 df.loc[1000]  # errors because 1000 is not in the index

~/anaconda3/lib/python3.7/site-packages/pandas/core/indexing.py in __getitem__(self, key)
    877 
    878             maybe_callable = com.apply_if_callable(key, self.obj)
--> 879             return self._getitem_axis(maybe_callable, axis=axis)
    880 
    881     def _is_scalar_access(self, key: Tuple):

~/anaconda3/lib/python3.7/site-packages/pandas/core/indexing.py in _getitem_axis(self, key, axis)
   1108         # fall thru to straight lookup
   1109         self._validate_key(key, axis)
-> 1110         return self._get_label(key, axis=axis)
   1111 
   1112     def _get_slice_axis(self, slice_obj: slice, axis: int):

~/anaconda3/lib/python3.7/site-packages/pandas/core/indexing.py in _get_label(self, label, axis)
   1057     def _get_label(self, label, axis: int):
   1058         # GH#5667 this will fail if the label is not present in the axis.
-> 1059         return self.obj.xs(label, axis=axis)
   1060 
   1061     def _handle_lowerdim_multi_index_axis0(self, tup: Tuple):

~/anaconda3/lib/python3.7/site-packages/pandas/core/generic.py in xs(self, key, axis, level, drop_level)
   3480             loc, new_index = self.index.get_loc_level(key, drop_level=drop_level)
   3481         else:
-> 3482             loc = self.index.get_loc(key)
   3483 
   3484             if isinstance(loc, np.ndarray):

~/anaconda3/lib/python3.7/site-packages/pandas/core/indexes/range.py in get_loc(self, key, method, tolerance)
    351                     return self._range.index(new_key)
    352                 except ValueError as err:
--> 353                     raise KeyError(key) from err
    354             raise KeyError(key)
    355         return super().get_loc(key, method=method, tolerance=tolerance)

KeyError: 1000
In [11]:
df2.loc[1000]  # works here because 1000 IS a label in df2's index; the row comes back as a Series
Out[11]:
name       Marufuku
cuisine    Japanese
orders         2241
rating          4.8
Name: 1000, dtype: object
In [12]:
type(df2.loc[1000])  # we'll come back to what a Series is later
Out[12]:
pandas.core.series.Series
In [13]:
df.iloc[3]  # .iloc is position-based: the fourth row, regardless of index labels
Out[13]:
id               4000
name       Tako Sushi
cuisine      Japanese
orders           1739
rating            4.2
Name: 3, dtype: object
In [14]:
df2.iloc[3]  # this is the same as the above because both dataframes have the same restaurant in the row at index 3
Out[14]:
name       Tako Sushi
cuisine      Japanese
orders           1739
rating            4.2
Name: 4000, dtype: object

Accessing particular columns: pd.Series

In [15]:
# Selecting a single column with [] yields a pd.Series.
s = df['name']
s
Out[15]:
0           Marufuku
1    Jack in the Box
2         Thai Basil
3         Tako Sushi
4         McDonald's
Name: name, dtype: object
In [16]:
type(s)  # each column is a Series
Out[16]:
pandas.core.series.Series
In [17]:
s.index  # a Series has an index
Out[17]:
RangeIndex(start=0, stop=5, step=1)
In [18]:
s.shape  # a Series also has a shape
Out[18]:
(5,)
In [19]:
s.iloc[2:4]  # position-based slicing: end point EXCLUDED (positions 2 and 3)
Out[19]:
2    Thai Basil
3    Tako Sushi
Name: name, dtype: object
In [20]:
s.loc[2:4]  # label-based slicing: end point INCLUDED (labels 2, 3 AND 4)
Out[20]:
2    Thai Basil
3    Tako Sushi
4    McDonald's
Name: name, dtype: object
In [21]:
s.columns  # a Series is NOT a DataFrame!
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-21-27a1b26bb635> in <module>
----> 1 s.columns  # a Series is NOT a DataFrame!

~/anaconda3/lib/python3.7/site-packages/pandas/core/generic.py in __getattr__(self, name)
   5128             if self._info_axis._can_hold_identifiers_and_holds_name(name):
   5129                 return self[name]
-> 5130             return object.__getattribute__(self, name)
   5131 
   5132     def __setattr__(self, name: str, value) -> None:

AttributeError: 'Series' object has no attribute 'columns'

Accessing particular rows: boolean array indexing

In [22]:
df
Out[22]:
id name cuisine orders rating
0 1000 Marufuku Japanese 2241 4.8
1 2000 Jack in the Box Fast Food 1592 4.1
2 3000 Thai Basil Thai 820 3.9
3 4000 Tako Sushi Japanese 1739 4.2
4 5000 McDonald's Fast Food 1039 4.5
In [23]:
df['cuisine'] == "Japanese"  # returns what is called a "boolean array"
Out[23]:
0     True
1    False
2    False
3     True
4    False
Name: cuisine, dtype: bool
In [24]:
df[ df['cuisine'] == "Japanese" ]  # keeps only the rows where the boolean array is True
Out[24]:
id name cuisine orders rating
0 1000 Marufuku Japanese 2241 4.8
3 4000 Tako Sushi Japanese 1739 4.2
In [25]:
df.loc[ "cuisine" == "Japanese" ]  # common mistake
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-25-21e239a7d4d1> in <module>
----> 1 df.loc[ "cuisine" == "Japanese" ]  # common mistake

~/anaconda3/lib/python3.7/site-packages/pandas/core/indexing.py in __getitem__(self, key)
    877 
    878             maybe_callable = com.apply_if_callable(key, self.obj)
--> 879             return self._getitem_axis(maybe_callable, axis=axis)
    880 
    881     def _is_scalar_access(self, key: Tuple):

~/anaconda3/lib/python3.7/site-packages/pandas/core/indexing.py in _getitem_axis(self, key, axis)
   1108         # fall thru to straight lookup
   1109         self._validate_key(key, axis)
-> 1110         return self._get_label(key, axis=axis)
   1111 
   1112     def _get_slice_axis(self, slice_obj: slice, axis: int):

~/anaconda3/lib/python3.7/site-packages/pandas/core/indexing.py in _get_label(self, label, axis)
   1057     def _get_label(self, label, axis: int):
   1058         # GH#5667 this will fail if the label is not present in the axis.
-> 1059         return self.obj.xs(label, axis=axis)
   1060 
   1061     def _handle_lowerdim_multi_index_axis0(self, tup: Tuple):

~/anaconda3/lib/python3.7/site-packages/pandas/core/generic.py in xs(self, key, axis, level, drop_level)
   3480             loc, new_index = self.index.get_loc_level(key, drop_level=drop_level)
   3481         else:
-> 3482             loc = self.index.get_loc(key)
   3483 
   3484             if isinstance(loc, np.ndarray):

~/anaconda3/lib/python3.7/site-packages/pandas/core/indexes/range.py in get_loc(self, key, method, tolerance)
    352                 except ValueError as err:
    353                     raise KeyError(key) from err
--> 354             raise KeyError(key)
    355         return super().get_loc(key, method=method, tolerance=tolerance)
    356 

KeyError: False
In [26]:
df[ (df['cuisine'] == "Japanese") & (df['rating'] >= 4.5) ]  # can use more than 1 condition
Out[26]:
id name cuisine orders rating
0 1000 Marufuku Japanese 2241 4.8
In [27]:
# Without parentheses, `&` binds tighter than the comparisons, so
# `"Japanese" & df['rating']` is evaluated first and raises a TypeError.
df[ df['cuisine'] == "Japanese" & df['rating'] >= 4.5 ]  # parentheses matter!
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
~/anaconda3/lib/python3.7/site-packages/pandas/core/ops/array_ops.py in na_logical_op(x, y, op)
    260         #  (xint or xbool) and (yint or bool)
--> 261         result = op(x, y)
    262     except TypeError:

~/anaconda3/lib/python3.7/site-packages/pandas/core/ops/roperator.py in rand_(left, right)
     51 def rand_(left, right):
---> 52     return operator.and_(right, left)
     53 

TypeError: ufunc 'bitwise_and' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

During handling of the above exception, another exception occurred:

ValueError                                Traceback (most recent call last)
~/anaconda3/lib/python3.7/site-packages/pandas/core/ops/array_ops.py in na_logical_op(x, y, op)
    274             try:
--> 275                 result = libops.scalar_binop(x, y, op)
    276             except (

pandas/_libs/ops.pyx in pandas._libs.ops.scalar_binop()

ValueError: Buffer dtype mismatch, expected 'Python object' but got 'double'

The above exception was the direct cause of the following exception:

TypeError                                 Traceback (most recent call last)
<ipython-input-27-6e45bcf433c0> in <module>
----> 1 df[ df['cuisine'] == "Japanese" & df['rating'] >= 4.5 ]  # parentheses matter!

~/anaconda3/lib/python3.7/site-packages/pandas/core/ops/common.py in new_method(self, other)
     63         other = item_from_zerodim(other)
     64 
---> 65         return method(self, other)
     66 
     67     return new_method

~/anaconda3/lib/python3.7/site-packages/pandas/core/ops/__init__.py in wrapper(self, other)
    392         rvalues = extract_array(other, extract_numpy=True)
    393 
--> 394         res_values = logical_op(lvalues, rvalues, op)
    395         return self._construct_result(res_values, name=res_name)
    396 

~/anaconda3/lib/python3.7/site-packages/pandas/core/ops/array_ops.py in logical_op(left, right, op)
    349         filler = fill_int if is_self_int_dtype and is_other_int_dtype else fill_bool
    350 
--> 351         res_values = na_logical_op(lvalues, rvalues, op)
    352         res_values = filler(res_values)  # type: ignore
    353 

~/anaconda3/lib/python3.7/site-packages/pandas/core/ops/array_ops.py in na_logical_op(x, y, op)
    285                     f"Cannot perform '{op.__name__}' with a dtyped [{x.dtype}] array "
    286                     f"and scalar of type [{typ}]"
--> 287                 ) from err
    288 
    289     return result.reshape(x.shape)

TypeError: Cannot perform 'rand_' with a dtyped [float64] array and scalar of type [bool]

Group By Operations in Pandas

In [28]:
df
Out[28]:
id name cuisine orders rating
0 1000 Marufuku Japanese 2241 4.8
1 2000 Jack in the Box Fast Food 1592 4.1
2 3000 Thai Basil Thai 820 3.9
3 4000 Tako Sushi Japanese 1739 4.2
4 5000 McDonald's Fast Food 1039 4.5
In [29]:
df.groupby('cuisine')  # this creates an intermediate object which has the groups within it
Out[29]:
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x11ee3e850>
In [30]:
df.groupby('cuisine').agg(np.mean)  # notice that np.mean is used on ALL numerical columns; NOTE: newer pandas prefers the string alias .agg("mean")
Out[30]:
id orders rating
cuisine
Fast Food 3500.0 1315.5 4.3
Japanese 2500.0 1990.0 4.5
Thai 3000.0 820.0 3.9
In [31]:
df.groupby('cuisine')['rating'].agg(np.mean)  # np.mean is only used on the rating column
Out[31]:
cuisine
Fast Food    4.3
Japanese     4.5
Thai         3.9
Name: rating, dtype: float64
In [32]:
df.groupby('cuisine')[['rating']].agg(np.mean)  # if you want it as a df instead
Out[32]:
rating
cuisine
Fast Food 4.3
Japanese 4.5
Thai 3.9

YOUR TURN: Find the lowest rating within each cuisine.

In [33]:
df.groupby('cuisine')[['rating']].agg(np.min)  # double brackets keep the result as a DataFrame
Out[33]:
rating
cuisine
Fast Food 4.1
Japanese 4.2
Thai 3.9

Find the restaurant with the lowest rating within each cuisine

In [34]:
# Implementation 1: Lambda Function
# Sort by rating first, so that .iloc[0] inside each group is the lowest-rated row.
df.sort_values("rating").groupby('cuisine')['name'].agg(lambda group: group.iloc[0])
Out[34]:
cuisine
Fast Food    Jack in the Box
Japanese          Tako Sushi
Thai              Thai Basil
Name: name, dtype: object
In [35]:
# Implementation 2: .first() aggregator
# After sorting by rating, each group's first row is its lowest-rated restaurant.
df.sort_values("rating").groupby('cuisine')['name'].first()
Out[35]:
cuisine
Fast Food    Jack in the Box
Japanese          Tako Sushi
Thai              Thai Basil
Name: name, dtype: object

Joins

In [36]:
# Two small rosters with the same column layout, used to demonstrate joins.
left = pd.DataFrame(
   [(1, 'Alex',   'blue',  'sub1'),
    (2, 'Amy',    'blue',  'sub2'),
    (3, 'Allen',  'red',   'sub4'),
    (4, 'Alice',  'blue',  'sub6'),
    (5, 'Ayoung', 'green', 'sub5')],
   columns=['id', 'Name', 'shirt_color', 'subject_id'])
right = pd.DataFrame(
   [(1, 'Billy', 'red',  'sub2'),
    (2, 'Brian', 'blue', 'sub4'),
    (3, 'Bran',  'red',  'sub3'),
    (4, 'Bryce', 'red',  'sub6'),
    (5, 'Betty', 'blue', 'sub5')],
   columns=['id', 'Name', 'shirt_color', 'subject_id'])
In [37]:
left
Out[37]:
id Name shirt_color subject_id
0 1 Alex blue sub1
1 2 Amy blue sub2
2 3 Allen red sub4
3 4 Alice blue sub6
4 5 Ayoung green sub5
In [38]:
right
Out[38]:
id Name shirt_color subject_id
0 1 Billy red sub2
1 2 Brian blue sub4
2 3 Bran red sub3
3 4 Bryce red sub6
4 5 Betty blue sub5
In [39]:
pd.merge(left, right, on='id')  # join on id; overlapping column names get _x/_y suffixes
Out[39]:
id Name_x shirt_color_x subject_id_x Name_y shirt_color_y subject_id_y
0 1 Alex blue sub1 Billy red sub2
1 2 Amy blue sub2 Brian blue sub4
2 3 Allen red sub4 Bran red sub3
3 4 Alice blue sub6 Bryce red sub6
4 5 Ayoung green sub5 Betty blue sub5
In [40]:
pd.merge(left, right, on=['id', 'shirt_color'])  # multiple keys: rows must match on BOTH columns
Out[40]:
id Name_x shirt_color subject_id_x Name_y subject_id_y
0 2 Amy blue sub2 Brian sub4
1 3 Allen red sub4 Bran sub3
In [41]:
pd.merge(left, right, on='shirt_color')  # duplicated key values produce every matching (left, right) pair
Out[41]:
id_x Name_x shirt_color subject_id_x id_y Name_y subject_id_y
0 1 Alex blue sub1 2 Brian sub4
1 1 Alex blue sub1 5 Betty sub5
2 2 Amy blue sub2 2 Brian sub4
3 2 Amy blue sub2 5 Betty sub5
4 4 Alice blue sub6 2 Brian sub4
5 4 Alice blue sub6 5 Betty sub5
6 3 Allen red sub4 1 Billy sub2
7 3 Allen red sub4 3 Bran sub3
8 3 Allen red sub4 4 Bryce sub6
In [42]:
pd.merge(left, right, on='shirt_color', how='inner')  # 'inner' is the default, so this matches the previous output
Out[42]:
id_x Name_x shirt_color subject_id_x id_y Name_y subject_id_y
0 1 Alex blue sub1 2 Brian sub4
1 1 Alex blue sub1 5 Betty sub5
2 2 Amy blue sub2 2 Brian sub4
3 2 Amy blue sub2 5 Betty sub5
4 4 Alice blue sub6 2 Brian sub4
5 4 Alice blue sub6 5 Betty sub5
6 3 Allen red sub4 1 Billy sub2
7 3 Allen red sub4 3 Bran sub3
8 3 Allen red sub4 4 Bryce sub6
In [43]:
pd.merge(left, right, on='subject_id', how='left')  # keep every left row; unmatched right columns become NaN
Out[43]:
id_x Name_x shirt_color_x subject_id id_y Name_y shirt_color_y
0 1 Alex blue sub1 NaN NaN NaN
1 2 Amy blue sub2 1.0 Billy red
2 3 Allen red sub4 2.0 Brian blue
3 4 Alice blue sub6 4.0 Bryce red
4 5 Ayoung green sub5 5.0 Betty blue
In [44]:
pd.merge(left, right, on='shirt_color', how='left')  # Ayoung (green) has no match, so the right-side columns are NaN
Out[44]:
id_x Name_x shirt_color subject_id_x id_y Name_y subject_id_y
0 1 Alex blue sub1 2.0 Brian sub4
1 1 Alex blue sub1 5.0 Betty sub5
2 2 Amy blue sub2 2.0 Brian sub4
3 2 Amy blue sub2 5.0 Betty sub5
4 3 Allen red sub4 1.0 Billy sub2
5 3 Allen red sub4 3.0 Bran sub3
6 3 Allen red sub4 4.0 Bryce sub6
7 4 Alice blue sub6 2.0 Brian sub4
8 4 Alice blue sub6 5.0 Betty sub5
9 5 Ayoung green sub5 NaN NaN NaN
In [45]:
pd.merge(left, right, on='shirt_color', how='right')  # keep every right row instead
Out[45]:
id_x Name_x shirt_color subject_id_x id_y Name_y subject_id_y
0 3 Allen red sub4 1 Billy sub2
1 3 Allen red sub4 3 Bran sub3
2 3 Allen red sub4 4 Bryce sub6
3 1 Alex blue sub1 2 Brian sub4
4 2 Amy blue sub2 2 Brian sub4
5 4 Alice blue sub6 2 Brian sub4
6 1 Alex blue sub1 5 Betty sub5
7 2 Amy blue sub2 5 Betty sub5
8 4 Alice blue sub6 5 Betty sub5
In [46]:
pd.merge(left, right, on='Name', how='outer')  # no names overlap, so every row appears once with NaNs on the other side
Out[46]:
id_x Name shirt_color_x subject_id_x id_y shirt_color_y subject_id_y
0 1.0 Alex blue sub1 NaN NaN NaN
1 2.0 Amy blue sub2 NaN NaN NaN
2 3.0 Allen red sub4 NaN NaN NaN
3 4.0 Alice blue sub6 NaN NaN NaN
4 5.0 Ayoung green sub5 NaN NaN NaN
5 NaN Billy NaN NaN 1.0 red sub2
6 NaN Brian NaN NaN 2.0 blue sub4
7 NaN Bran NaN NaN 3.0 red sub3
8 NaN Bryce NaN NaN 4.0 red sub6
9 NaN Betty NaN NaN 5.0 blue sub5

Pivot Tables

Any pivot_table operation has a corresponding groupby operation that gives the same output, but the pivot_table displays the output in a slightly different format. Specifically, pivot_table places one (or more) of the keys you group by as columns in the output DataFrame, whereas groupby places all of the keys you group by in the index of the output DataFrame. In some cases, pivot_table and groupby can give the exact same output.

In [47]:
# Example frame (from the pandas pivot_table docs): two grouping keys (A, B),
# one column to pivot on (C), and two numeric columns to aggregate (D, E).
data = pd.DataFrame(
    [("foo", "one", "small", 1, 2),
     ("foo", "one", "large", 2, 4),
     ("foo", "one", "large", 2, 5),
     ("foo", "two", "small", 3, 5),
     ("foo", "two", "small", 3, 6),
     ("bar", "one", "large", 4, 6),
     ("bar", "one", "small", 5, 8),
     ("bar", "two", "small", 6, 9),
     ("bar", "two", "large", 7, 9)],
    columns=["A", "B", "C", "D", "E"],
)
data
Out[47]:
A B C D E
0 foo one small 1 2
1 foo one large 2 4
2 foo one large 2 5
3 foo two small 3 5
4 foo two small 3 6
5 bar one large 4 6
6 bar one small 5 8
7 bar two small 6 9
8 bar two large 7 9
In [48]:
# C's values become columns; (A, B) pairs with no rows for a C value are filled with 0.
pd.pivot_table(data, values='D', index=['A', 'B'],
                    columns=['C'], aggfunc=np.sum, fill_value=0)
Out[48]:
C large small
A B
bar one 4 5
two 7 6
foo one 4 1
two 0 6
In [49]:
data.groupby(['A', 'B', 'C'])[['D']].agg(np.sum)  # equivalent groupby: C stays in the index instead of becoming columns
Out[49]:
D
A B C
bar one large 4
small 5
two large 7
small 6
foo one large 4
small 1
two small 6
In [50]:
# Per-column aggregation functions via a dict (both happen to be mean here).
pd.pivot_table(data, values=['D', 'E'], index=['A', 'C'],
                    aggfunc={'D': np.mean,
                             'E': np.mean})
Out[50]:
D E
A C
bar large 5.500000 7.500000
small 5.500000 8.500000
foo large 2.000000 4.500000
small 2.333333 4.333333
In [51]:
data.groupby(["A", "C"])[["D", "E"]].agg(np.mean)  # identical output to the pivot_table above
Out[51]:
D E
A C
bar large 5.500000 7.500000
small 5.500000 8.500000
foo large 2.000000 4.500000
small 2.333333 4.333333
In [52]:
# A column can get a LIST of aggregators; the result grows a column MultiIndex.
pd.pivot_table(data, values=['D', 'E'], index=['A', 'C'],
                    aggfunc={'D': np.mean,
                             'E': [min, max, np.mean]})
Out[52]:
D E
mean max mean min
A C
bar large 5.500000 9.0 7.500000 6.0
small 5.500000 9.0 8.500000 8.0
foo large 2.000000 5.0 4.500000 4.0
small 2.333333 6.0 4.333333 2.0
In [53]:
# groupby equivalent of the pivot_table above; only the aggregator column order differs.
data.groupby(["A", "C"])[["D", "E"]].agg({'D': np.mean,
                                          'E': [min, max, np.mean]})
Out[53]:
D E
mean min max mean
A C
bar large 5.500000 6 9 7.500000
small 5.500000 8 9 8.500000
foo large 2.000000 4 5 4.500000
small 2.333333 2 6 4.333333