Pandas

This is a sample notebook for pandas, an extremely powerful python module that allows for data I/O, plotting, and general data analysis

[1]:
# As usual, first we import different packages:
#  - matplotlib to plot things
#  - numpy for numerical operations
#  - pandas
[1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
[4]:
# 1.  In this first example we create what is known as a pandas "data frame"
#    This is a way to organize data that is tabular in nature, i.e., rows
#    and columns.
#
#    The syntax is ({ xxx }) where xxx contains the list (not in the python
#    sense) of data.  Each entry can be thought of as a row by column table.
#    the first row, or column headings, are listed first, then a colon (:),
#    then the row values in each column are listed between []'s.  The first
#    column is always an index.
#    The example below lists a table of people, their children and pets.
[2]:
df = pd.DataFrame({
    'name':['john','mary','peter','jeff','bill','lisa','jose'],
    'age':[23,78,22,19,45,33,20],
    'gender':['M','F','M','M','M','F','M'],
    'state':['california','dc','california','dc','california','texas','texas'],
    'num_children':[2,0,0,3,2,1,4],
    'num_pets':[5,1,0,5,2,2,3]
})
[3]:
# To print out this data frame, just enter the DataFrame name
[4]:
df
[4]:
name age gender state num_children num_pets
0 john 23 M california 2 5
1 mary 78 F dc 0 1
2 peter 22 M california 0 0
3 jeff 19 M dc 3 5
4 bill 45 M california 2 2
5 lisa 33 F texas 1 2
6 jose 20 M texas 4 3
[8]:
# Alternately, specify certain columns by givin their name
[5]:
type(df)
[5]:
pandas.core.frame.DataFrame
[9]:
df[['state', 'name']]
[9]:
state name
0 california john
1 dc mary
2 california peter
3 dc jeff
4 california bill
5 texas lisa
6 texas jose
[10]:
# It's a little more tricky to display certain rows; for
# this use the "iloc" method, e.g., to display the third
# row fifth column (NOTE: python starts counting at 0,
# and the first column is an index not technically part
# of the DataFrame for these purposes)
[6]:
df.iloc[3,5]
[6]:
5
[12]:
# search a column for a value, print out that row
[8]:
df.loc[df['state']=='california']
[8]:
name age gender state num_children num_pets
0 john 23 M california 2 5
2 peter 22 M california 0 0
4 bill 45 M california 2 2
[9]:
dir(df)
[9]:
['T',
 '_AXIS_LEN',
 '_AXIS_ORDERS',
 '_AXIS_REVERSED',
 '_AXIS_TO_AXIS_NUMBER',
 '_HANDLED_TYPES',
 '__abs__',
 '__add__',
 '__and__',
 '__annotations__',
 '__array__',
 '__array_priority__',
 '__array_ufunc__',
 '__array_wrap__',
 '__bool__',
 '__class__',
 '__contains__',
 '__copy__',
 '__deepcopy__',
 '__delattr__',
 '__delitem__',
 '__dict__',
 '__dir__',
 '__divmod__',
 '__doc__',
 '__eq__',
 '__finalize__',
 '__floordiv__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__iadd__',
 '__iand__',
 '__ifloordiv__',
 '__imod__',
 '__imul__',
 '__init__',
 '__init_subclass__',
 '__invert__',
 '__ior__',
 '__ipow__',
 '__isub__',
 '__iter__',
 '__itruediv__',
 '__ixor__',
 '__le__',
 '__len__',
 '__lt__',
 '__matmul__',
 '__mod__',
 '__module__',
 '__mul__',
 '__ne__',
 '__neg__',
 '__new__',
 '__nonzero__',
 '__or__',
 '__pos__',
 '__pow__',
 '__radd__',
 '__rand__',
 '__rdivmod__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__rfloordiv__',
 '__rmatmul__',
 '__rmod__',
 '__rmul__',
 '__ror__',
 '__round__',
 '__rpow__',
 '__rsub__',
 '__rtruediv__',
 '__rxor__',
 '__setattr__',
 '__setitem__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__sub__',
 '__subclasshook__',
 '__truediv__',
 '__weakref__',
 '__xor__',
 '_accessors',
 '_accum_func',
 '_add_numeric_operations',
 '_agg_by_level',
 '_agg_examples_doc',
 '_agg_summary_and_see_also_doc',
 '_align_frame',
 '_align_series',
 '_arith_method',
 '_as_manager',
 '_attrs',
 '_box_col_values',
 '_can_fast_transpose',
 '_check_inplace_and_allows_duplicate_labels',
 '_check_inplace_setting',
 '_check_is_chained_assignment_possible',
 '_check_label_or_level_ambiguity',
 '_check_setitem_copy',
 '_clear_item_cache',
 '_clip_with_one_bound',
 '_clip_with_scalar',
 '_cmp_method',
 '_combine_frame',
 '_consolidate',
 '_consolidate_inplace',
 '_construct_axes_dict',
 '_construct_axes_from_arguments',
 '_construct_result',
 '_constructor',
 '_constructor_sliced',
 '_convert',
 '_count_level',
 '_data',
 '_dir_additions',
 '_dir_deletions',
 '_dispatch_frame_op',
 '_drop_axis',
 '_drop_labels_or_levels',
 '_ensure_valid_index',
 '_find_valid_index',
 '_flags',
 '_from_arrays',
 '_from_mgr',
 '_get_agg_axis',
 '_get_axis',
 '_get_axis_name',
 '_get_axis_number',
 '_get_axis_resolvers',
 '_get_block_manager_axis',
 '_get_bool_data',
 '_get_cleaned_column_resolvers',
 '_get_column_array',
 '_get_index_resolvers',
 '_get_item_cache',
 '_get_label_or_level_values',
 '_get_numeric_data',
 '_get_value',
 '_getitem_bool_array',
 '_getitem_multilevel',
 '_gotitem',
 '_hidden_attrs',
 '_indexed_same',
 '_info_axis',
 '_info_axis_name',
 '_info_axis_number',
 '_info_repr',
 '_init_mgr',
 '_inplace_method',
 '_internal_names',
 '_internal_names_set',
 '_is_copy',
 '_is_homogeneous_type',
 '_is_label_or_level_reference',
 '_is_label_reference',
 '_is_level_reference',
 '_is_mixed_type',
 '_is_view',
 '_iset_item',
 '_iset_item_mgr',
 '_iset_not_inplace',
 '_item_cache',
 '_iter_column_arrays',
 '_ixs',
 '_join_compat',
 '_logical_func',
 '_logical_method',
 '_maybe_cache_changed',
 '_maybe_update_cacher',
 '_metadata',
 '_mgr',
 '_min_count_stat_function',
 '_needs_reindex_multi',
 '_protect_consolidate',
 '_reduce',
 '_reindex_axes',
 '_reindex_columns',
 '_reindex_index',
 '_reindex_multi',
 '_reindex_with_indexers',
 '_replace_columnwise',
 '_repr_data_resource_',
 '_repr_fits_horizontal_',
 '_repr_fits_vertical_',
 '_repr_html_',
 '_repr_latex_',
 '_reset_cache',
 '_reset_cacher',
 '_sanitize_column',
 '_series',
 '_set_axis',
 '_set_axis_name',
 '_set_axis_nocheck',
 '_set_is_copy',
 '_set_item',
 '_set_item_frame_value',
 '_set_item_mgr',
 '_set_value',
 '_setitem_array',
 '_setitem_frame',
 '_setitem_slice',
 '_slice',
 '_stat_axis',
 '_stat_axis_name',
 '_stat_axis_number',
 '_stat_function',
 '_stat_function_ddof',
 '_take_with_is_copy',
 '_to_dict_of_blocks',
 '_typ',
 '_update_inplace',
 '_validate_dtype',
 '_values',
 '_where',
 'abs',
 'add',
 'add_prefix',
 'add_suffix',
 'age',
 'agg',
 'aggregate',
 'align',
 'all',
 'any',
 'append',
 'apply',
 'applymap',
 'asfreq',
 'asof',
 'assign',
 'astype',
 'at',
 'at_time',
 'attrs',
 'axes',
 'backfill',
 'between_time',
 'bfill',
 'bool',
 'boxplot',
 'clip',
 'columns',
 'combine',
 'combine_first',
 'compare',
 'convert_dtypes',
 'copy',
 'corr',
 'corrwith',
 'count',
 'cov',
 'cummax',
 'cummin',
 'cumprod',
 'cumsum',
 'describe',
 'diff',
 'div',
 'divide',
 'dot',
 'drop',
 'drop_duplicates',
 'droplevel',
 'dropna',
 'dtypes',
 'duplicated',
 'empty',
 'eq',
 'equals',
 'eval',
 'ewm',
 'expanding',
 'explode',
 'ffill',
 'fillna',
 'filter',
 'first',
 'first_valid_index',
 'flags',
 'floordiv',
 'from_dict',
 'from_records',
 'ge',
 'gender',
 'get',
 'groupby',
 'gt',
 'head',
 'hist',
 'iat',
 'idxmax',
 'idxmin',
 'iloc',
 'index',
 'infer_objects',
 'info',
 'insert',
 'interpolate',
 'isin',
 'isna',
 'isnull',
 'items',
 'iteritems',
 'iterrows',
 'itertuples',
 'join',
 'keys',
 'kurt',
 'kurtosis',
 'last',
 'last_valid_index',
 'le',
 'loc',
 'lookup',
 'lt',
 'mad',
 'mask',
 'max',
 'mean',
 'median',
 'melt',
 'memory_usage',
 'merge',
 'min',
 'mod',
 'mode',
 'mul',
 'multiply',
 'name',
 'ndim',
 'ne',
 'nlargest',
 'notna',
 'notnull',
 'nsmallest',
 'num_children',
 'num_pets',
 'nunique',
 'pad',
 'pct_change',
 'pipe',
 'pivot',
 'pivot_table',
 'plot',
 'pop',
 'pow',
 'prod',
 'product',
 'quantile',
 'query',
 'radd',
 'rank',
 'rdiv',
 'reindex',
 'reindex_like',
 'rename',
 'rename_axis',
 'reorder_levels',
 'replace',
 'resample',
 'reset_index',
 'rfloordiv',
 'rmod',
 'rmul',
 'rolling',
 'round',
 'rpow',
 'rsub',
 'rtruediv',
 'sample',
 'select_dtypes',
 'sem',
 'set_axis',
 'set_flags',
 'set_index',
 'shape',
 'shift',
 'size',
 'skew',
 'slice_shift',
 'sort_index',
 'sort_values',
 'squeeze',
 'stack',
 'state',
 'std',
 'style',
 'sub',
 'subtract',
 'sum',
 'swapaxes',
 'swaplevel',
 'tail',
 'take',
 'to_clipboard',
 'to_csv',
 'to_dict',
 'to_excel',
 'to_feather',
 'to_gbq',
 'to_hdf',
 'to_html',
 'to_json',
 'to_latex',
 'to_markdown',
 'to_numpy',
 'to_parquet',
 'to_period',
 'to_pickle',
 'to_records',
 'to_sql',
 'to_stata',
 'to_string',
 'to_timestamp',
 'to_xarray',
 'to_xml',
 'transform',
 'transpose',
 'truediv',
 'truncate',
 'tz_convert',
 'tz_localize',
 'unstack',
 'update',
 'value_counts',
 'values',
 'var',
 'where',
 'xs']
[14]:
# search the table for values in a column, print
# corresponding values in other column
[10]:
df['name'].where(df['num_pets']>3)
[10]:
0    john
1     NaN
2     NaN
3    jeff
4     NaN
5     NaN
6     NaN
Name: name, dtype: object
[11]:
df.groupby('state')['name'].count()
[11]:
state
california    3
dc            2
texas         2
Name: name, dtype: int64
[17]:
# 2. Now we make some plots.  There are many ways to do this, including
#   both pandas and matplotlib.  Here we will plot one column against
#   another
[14]:
# plot num_children vs num_pets as line (doesn't really make sense)
df.plot(kind='scatter',x='num_children',y='num_pets',color='red')
[14]:
<AxesSubplot:xlabel='num_children', ylabel='num_pets'>
../_images/notebooks_0202_Pandas_19_1.png
[15]:
# plot num_children vs num_pets as a scatter plot
[16]:
df.plot(kind='scatter',x='num_children',y='num_pets',color='blue')
[16]:
<AxesSubplot:xlabel='num_children', ylabel='num_pets'>
../_images/notebooks_0202_Pandas_21_1.png
[17]:
# plot name vs age as bar chart (note we can do this even though
#  'name' is not a number since it's a bar chart)
[18]:
df.plot(kind='bar',x='name',y='age')
[18]:
<AxesSubplot:xlabel='name'>
../_images/notebooks_0202_Pandas_23_1.png
[19]:
# now make a single plot with two lines
# This:
#
#df.plot(kind='line',x='name',y='num_children',color='blue')
#df.plot(kind='line',x='name',y='num_pets',color='red')
#
# would actually give two separate plots, one above the
# the other.  To out both on a single graph, we make use of
# some matplotlib functions
[21]:
# a. clear the graph
plt.clf()

# b. use matplotlib "gca", or "get current axis"
ax = plt.gca()

# c. make two plots, and use the same axes
df.plot(kind='line',x='name',y='num_children',ax=ax)
df.plot(kind='line',x='name',y='num_pets', color='red', ax=ax)
[21]:
<AxesSubplot:xlabel='name'>
../_images/notebooks_0202_Pandas_25_1.png
[22]:
# we can
[24]:
df.groupby('state')['name'].count().plot(kind='bar')
[24]:
<AxesSubplot:xlabel='state'>
../_images/notebooks_0202_Pandas_27_1.png
[25]:
df.groupby('state').size().plot(kind='bar')
[25]:
<AxesSubplot:xlabel='state'>
../_images/notebooks_0202_Pandas_28_1.png
[26]:
df.groupby(['state','gender']).size().unstack().plot(kind='bar',stacked=True)
plt.show()
../_images/notebooks_0202_Pandas_29_0.png
[29]:
plt.clf()
df.groupby(['gender','state']).size().unstack().plot(kind='bar',stacked=True)
plt.legend(loc='lower right')
plt.gcf().set_size_inches(7,4)
plt.show()
<Figure size 432x288 with 0 Axes>
../_images/notebooks_0202_Pandas_30_1.png

plot histogram for a dataframe column

[30]:
df[['age']].plot(kind='hist',bins=[0,20,40,60,80,100],rwidth=0.9,)
[30]:
<AxesSubplot:ylabel='Frequency'>
../_images/notebooks_0202_Pandas_32_1.png

percent single

[31]:
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick

# create dummy variable them group by that
# set the legend to false because we'll fix it later
df.assign(
 dummy = 1
).groupby(['dummy','state']).size().groupby(level=0).apply(
    lambda x: 100 * x / x.sum()
).to_frame().unstack().plot(kind='bar',stacked=True,legend=False)


plt.title('Amount of records by State, normalized')


# other it'll show up as 'dummy'
plt.xlabel('state')

# disable ticks in the x axis
plt.xticks([])

# fix the legend
current_handles, _ = plt.gca().get_legend_handles_labels()
reversed_handles = reversed(current_handles)

labels = reversed(df['state'].unique())

plt.legend(reversed_handles,labels,loc='lower right')

plt.gcf().set_size_inches(7,4)
plt.gca().yaxis.set_major_formatter(mtick.PercentFormatter())
plt.show()
../_images/notebooks_0202_Pandas_34_0.png

percent, two-level group by

[32]:
import matplotlib.ticker as mtick

df.groupby(['gender','state']).size().groupby(level=0).apply(
    lambda x: 100 * x / x.sum()
).unstack().plot(kind='bar',stacked=True,legend='reverse')

plt.gca().yaxis.set_major_formatter(mtick.PercentFormatter())
plt.title('Amount of records by Gender and State, normalized')
# plt.legend(loc='lower right')
plt.gcf().set_size_inches(7,4)
plt.show()
../_images/notebooks_0202_Pandas_36_0.png

date histograms

[33]:
df = pd.DataFrame({
    'name':['john','lisa','peter','carl','linda','betty'],
    'date_of_birth':[
        '01/21/1988','03/10/1977','07/25/1999','01/22/1977','09/30/1968','09/15/1970'
    ]
})

df
[33]:
name date_of_birth
0 john 01/21/1988
1 lisa 03/10/1977
2 peter 07/25/1999
3 carl 01/22/1977
4 linda 09/30/1968
5 betty 09/15/1970
[34]:
df['date_of_birth'] = pd.to_datetime(df['date_of_birth'],infer_datetime_format=True)
df
[34]:
name date_of_birth
0 john 1988-01-21
1 lisa 1977-03-10
2 peter 1999-07-25
3 carl 1977-01-22
4 linda 1968-09-30
5 betty 1970-09-15
[35]:
df.dtypes
[35]:
name                     object
date_of_birth    datetime64[ns]
dtype: object
[36]:
plt.clf()
df['date_of_birth'].map(lambda d: d.month).plot(kind='hist')
plt.xlabel('Month number')
plt.show()
../_images/notebooks_0202_Pandas_41_0.png

Now try dates

[37]:
# import modules
from datetime import datetime
from dateutil.parser import parse
import pandas as pd
[38]:
# Create string variable and convert to datetime format
war_start = '2011-01-03'
datetime.strptime(war_start, '%Y-%m-%d')
[38]:
datetime.datetime(2011, 1, 3, 0, 0)
[39]:
# create a list of string variables and convert to datetime format
attack_dates = ['7/2/2011', '8/6/2012', '11/13/2013', '5/26/2011', '5/2/2001']
[datetime.strptime(x, '%m/%d/%Y') for x in attack_dates]
[39]:
[datetime.datetime(2011, 7, 2, 0, 0),
 datetime.datetime(2012, 8, 6, 0, 0),
 datetime.datetime(2013, 11, 13, 0, 0),
 datetime.datetime(2011, 5, 26, 0, 0),
 datetime.datetime(2001, 5, 2, 0, 0)]
[40]:
# Use parse() to attempt to auto-convert common string formats
parse(war_start)
[40]:
datetime.datetime(2011, 1, 3, 0, 0)
[41]:
# Use parse() on every element of the attack_dates string
[parse(x) for x in attack_dates]
[41]:
[datetime.datetime(2011, 7, 2, 0, 0),
 datetime.datetime(2012, 8, 6, 0, 0),
 datetime.datetime(2013, 11, 13, 0, 0),
 datetime.datetime(2011, 5, 26, 0, 0),
 datetime.datetime(2001, 5, 2, 0, 0)]
[42]:
# Use parse, but designate that the day is first
parse(war_start, dayfirst=True)
[42]:
datetime.datetime(2011, 3, 1, 0, 0)
[43]:
# Create a dataframe

data = {'date': ['2014-05-01 18:47:05.069722', '2014-05-01 18:47:05.119994', '2014-05-02 18:47:05.178768', '2014-05-02 18:47:05.230071', '2014-05-02 18:47:05.230071', '2014-05-02 18:47:05.280592', '2014-05-03 18:47:05.332662', '2014-05-03 18:47:05.385109', '2014-05-04 18:47:05.436523', '2014-05-04 18:47:05.486877'],
        'value': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
df = pd.DataFrame(data, columns = ['date', 'value'])
print(df)
                         date  value
0  2014-05-01 18:47:05.069722      1
1  2014-05-01 18:47:05.119994      1
2  2014-05-02 18:47:05.178768      1
3  2014-05-02 18:47:05.230071      1
4  2014-05-02 18:47:05.230071      1
5  2014-05-02 18:47:05.280592      1
6  2014-05-03 18:47:05.332662      1
7  2014-05-03 18:47:05.385109      1
8  2014-05-04 18:47:05.436523      1
9  2014-05-04 18:47:05.486877      1
[44]:
# Convert df['date'] from string to datetime
pd.to_datetime(df['date'])
[44]:
0   2014-05-01 18:47:05.069722
1   2014-05-01 18:47:05.119994
2   2014-05-02 18:47:05.178768
3   2014-05-02 18:47:05.230071
4   2014-05-02 18:47:05.230071
5   2014-05-02 18:47:05.280592
6   2014-05-03 18:47:05.332662
7   2014-05-03 18:47:05.385109
8   2014-05-04 18:47:05.436523
9   2014-05-04 18:47:05.486877
Name: date, dtype: datetime64[ns]