Pandas¶
This is a sample notebook for pandas, an extremely powerful python module that allows for data I/O, plotting, and general data analysis
[1]:
# As usual, first we import different packages:
# - matplotlib to plot things
# - numpy for numerical operations
# - pandas
[1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
[4]:
# 1. In this first example we create what is known as a pandas "data frame"
# This is a way to organize data that is tabular in nature, i.e., rows
# and columns.
#
# The syntax is ({ xxx }) where xxx contains the list (not in the python
# sense) of data. Each entry can be thought of as a row by column table.
# the first row, or column headings, are listed first, then a colon (:),
# then the row values in each column are listed between []'s. The first
# column is always an index.
# The example below lists a table of people, their children and pets.
[2]:
df = pd.DataFrame({
'name':['john','mary','peter','jeff','bill','lisa','jose'],
'age':[23,78,22,19,45,33,20],
'gender':['M','F','M','M','M','F','M'],
'state':['california','dc','california','dc','california','texas','texas'],
'num_children':[2,0,0,3,2,1,4],
'num_pets':[5,1,0,5,2,2,3]
})
[3]:
# To print out this data frame, just enter the DataFrame name
[4]:
df
[4]:
name | age | gender | state | num_children | num_pets | |
---|---|---|---|---|---|---|
0 | john | 23 | M | california | 2 | 5 |
1 | mary | 78 | F | dc | 0 | 1 |
2 | peter | 22 | M | california | 0 | 0 |
3 | jeff | 19 | M | dc | 3 | 5 |
4 | bill | 45 | M | california | 2 | 2 |
5 | lisa | 33 | F | texas | 1 | 2 |
6 | jose | 20 | M | texas | 4 | 3 |
[8]:
# Alternately, specify certain columns by givin their name
[5]:
type(df)
[5]:
pandas.core.frame.DataFrame
[9]:
df[['state', 'name']]
[9]:
state | name | |
---|---|---|
0 | california | john |
1 | dc | mary |
2 | california | peter |
3 | dc | jeff |
4 | california | bill |
5 | texas | lisa |
6 | texas | jose |
[10]:
# It's a little more tricky to display certain rows; for
# this use the "iloc" method, e.g., to display the third
# row fifth column (NOTE: python starts counting at 0,
# and the first column is an index not technically part
# of the DataFrame for these purposes)
[6]:
df.iloc[3,5]
[6]:
5
[12]:
# search a column for a value, print out that row
[8]:
df.loc[df['state']=='california']
[8]:
name | age | gender | state | num_children | num_pets | |
---|---|---|---|---|---|---|
0 | john | 23 | M | california | 2 | 5 |
2 | peter | 22 | M | california | 0 | 0 |
4 | bill | 45 | M | california | 2 | 2 |
[9]:
dir(df)
[9]:
['T',
'_AXIS_LEN',
'_AXIS_ORDERS',
'_AXIS_REVERSED',
'_AXIS_TO_AXIS_NUMBER',
'_HANDLED_TYPES',
'__abs__',
'__add__',
'__and__',
'__annotations__',
'__array__',
'__array_priority__',
'__array_ufunc__',
'__array_wrap__',
'__bool__',
'__class__',
'__contains__',
'__copy__',
'__deepcopy__',
'__delattr__',
'__delitem__',
'__dict__',
'__dir__',
'__divmod__',
'__doc__',
'__eq__',
'__finalize__',
'__floordiv__',
'__format__',
'__ge__',
'__getattr__',
'__getattribute__',
'__getitem__',
'__getstate__',
'__gt__',
'__hash__',
'__iadd__',
'__iand__',
'__ifloordiv__',
'__imod__',
'__imul__',
'__init__',
'__init_subclass__',
'__invert__',
'__ior__',
'__ipow__',
'__isub__',
'__iter__',
'__itruediv__',
'__ixor__',
'__le__',
'__len__',
'__lt__',
'__matmul__',
'__mod__',
'__module__',
'__mul__',
'__ne__',
'__neg__',
'__new__',
'__nonzero__',
'__or__',
'__pos__',
'__pow__',
'__radd__',
'__rand__',
'__rdivmod__',
'__reduce__',
'__reduce_ex__',
'__repr__',
'__rfloordiv__',
'__rmatmul__',
'__rmod__',
'__rmul__',
'__ror__',
'__round__',
'__rpow__',
'__rsub__',
'__rtruediv__',
'__rxor__',
'__setattr__',
'__setitem__',
'__setstate__',
'__sizeof__',
'__str__',
'__sub__',
'__subclasshook__',
'__truediv__',
'__weakref__',
'__xor__',
'_accessors',
'_accum_func',
'_add_numeric_operations',
'_agg_by_level',
'_agg_examples_doc',
'_agg_summary_and_see_also_doc',
'_align_frame',
'_align_series',
'_arith_method',
'_as_manager',
'_attrs',
'_box_col_values',
'_can_fast_transpose',
'_check_inplace_and_allows_duplicate_labels',
'_check_inplace_setting',
'_check_is_chained_assignment_possible',
'_check_label_or_level_ambiguity',
'_check_setitem_copy',
'_clear_item_cache',
'_clip_with_one_bound',
'_clip_with_scalar',
'_cmp_method',
'_combine_frame',
'_consolidate',
'_consolidate_inplace',
'_construct_axes_dict',
'_construct_axes_from_arguments',
'_construct_result',
'_constructor',
'_constructor_sliced',
'_convert',
'_count_level',
'_data',
'_dir_additions',
'_dir_deletions',
'_dispatch_frame_op',
'_drop_axis',
'_drop_labels_or_levels',
'_ensure_valid_index',
'_find_valid_index',
'_flags',
'_from_arrays',
'_from_mgr',
'_get_agg_axis',
'_get_axis',
'_get_axis_name',
'_get_axis_number',
'_get_axis_resolvers',
'_get_block_manager_axis',
'_get_bool_data',
'_get_cleaned_column_resolvers',
'_get_column_array',
'_get_index_resolvers',
'_get_item_cache',
'_get_label_or_level_values',
'_get_numeric_data',
'_get_value',
'_getitem_bool_array',
'_getitem_multilevel',
'_gotitem',
'_hidden_attrs',
'_indexed_same',
'_info_axis',
'_info_axis_name',
'_info_axis_number',
'_info_repr',
'_init_mgr',
'_inplace_method',
'_internal_names',
'_internal_names_set',
'_is_copy',
'_is_homogeneous_type',
'_is_label_or_level_reference',
'_is_label_reference',
'_is_level_reference',
'_is_mixed_type',
'_is_view',
'_iset_item',
'_iset_item_mgr',
'_iset_not_inplace',
'_item_cache',
'_iter_column_arrays',
'_ixs',
'_join_compat',
'_logical_func',
'_logical_method',
'_maybe_cache_changed',
'_maybe_update_cacher',
'_metadata',
'_mgr',
'_min_count_stat_function',
'_needs_reindex_multi',
'_protect_consolidate',
'_reduce',
'_reindex_axes',
'_reindex_columns',
'_reindex_index',
'_reindex_multi',
'_reindex_with_indexers',
'_replace_columnwise',
'_repr_data_resource_',
'_repr_fits_horizontal_',
'_repr_fits_vertical_',
'_repr_html_',
'_repr_latex_',
'_reset_cache',
'_reset_cacher',
'_sanitize_column',
'_series',
'_set_axis',
'_set_axis_name',
'_set_axis_nocheck',
'_set_is_copy',
'_set_item',
'_set_item_frame_value',
'_set_item_mgr',
'_set_value',
'_setitem_array',
'_setitem_frame',
'_setitem_slice',
'_slice',
'_stat_axis',
'_stat_axis_name',
'_stat_axis_number',
'_stat_function',
'_stat_function_ddof',
'_take_with_is_copy',
'_to_dict_of_blocks',
'_typ',
'_update_inplace',
'_validate_dtype',
'_values',
'_where',
'abs',
'add',
'add_prefix',
'add_suffix',
'age',
'agg',
'aggregate',
'align',
'all',
'any',
'append',
'apply',
'applymap',
'asfreq',
'asof',
'assign',
'astype',
'at',
'at_time',
'attrs',
'axes',
'backfill',
'between_time',
'bfill',
'bool',
'boxplot',
'clip',
'columns',
'combine',
'combine_first',
'compare',
'convert_dtypes',
'copy',
'corr',
'corrwith',
'count',
'cov',
'cummax',
'cummin',
'cumprod',
'cumsum',
'describe',
'diff',
'div',
'divide',
'dot',
'drop',
'drop_duplicates',
'droplevel',
'dropna',
'dtypes',
'duplicated',
'empty',
'eq',
'equals',
'eval',
'ewm',
'expanding',
'explode',
'ffill',
'fillna',
'filter',
'first',
'first_valid_index',
'flags',
'floordiv',
'from_dict',
'from_records',
'ge',
'gender',
'get',
'groupby',
'gt',
'head',
'hist',
'iat',
'idxmax',
'idxmin',
'iloc',
'index',
'infer_objects',
'info',
'insert',
'interpolate',
'isin',
'isna',
'isnull',
'items',
'iteritems',
'iterrows',
'itertuples',
'join',
'keys',
'kurt',
'kurtosis',
'last',
'last_valid_index',
'le',
'loc',
'lookup',
'lt',
'mad',
'mask',
'max',
'mean',
'median',
'melt',
'memory_usage',
'merge',
'min',
'mod',
'mode',
'mul',
'multiply',
'name',
'ndim',
'ne',
'nlargest',
'notna',
'notnull',
'nsmallest',
'num_children',
'num_pets',
'nunique',
'pad',
'pct_change',
'pipe',
'pivot',
'pivot_table',
'plot',
'pop',
'pow',
'prod',
'product',
'quantile',
'query',
'radd',
'rank',
'rdiv',
'reindex',
'reindex_like',
'rename',
'rename_axis',
'reorder_levels',
'replace',
'resample',
'reset_index',
'rfloordiv',
'rmod',
'rmul',
'rolling',
'round',
'rpow',
'rsub',
'rtruediv',
'sample',
'select_dtypes',
'sem',
'set_axis',
'set_flags',
'set_index',
'shape',
'shift',
'size',
'skew',
'slice_shift',
'sort_index',
'sort_values',
'squeeze',
'stack',
'state',
'std',
'style',
'sub',
'subtract',
'sum',
'swapaxes',
'swaplevel',
'tail',
'take',
'to_clipboard',
'to_csv',
'to_dict',
'to_excel',
'to_feather',
'to_gbq',
'to_hdf',
'to_html',
'to_json',
'to_latex',
'to_markdown',
'to_numpy',
'to_parquet',
'to_period',
'to_pickle',
'to_records',
'to_sql',
'to_stata',
'to_string',
'to_timestamp',
'to_xarray',
'to_xml',
'transform',
'transpose',
'truediv',
'truncate',
'tz_convert',
'tz_localize',
'unstack',
'update',
'value_counts',
'values',
'var',
'where',
'xs']
[14]:
# search the table for values in a column, print
# corresponding values in other column
[10]:
df['name'].where(df['num_pets']>3)
[10]:
0 john
1 NaN
2 NaN
3 jeff
4 NaN
5 NaN
6 NaN
Name: name, dtype: object
[11]:
df.groupby('state')['name'].count()
[11]:
state
california 3
dc 2
texas 2
Name: name, dtype: int64
[17]:
# 2. Now we make some plots. There are many ways to do this, including
# both pandas and matplotlib. Here we will plot one column against
# another
[14]:
# plot num_children vs num_pets as line (doesn't really make sense)
df.plot(kind='scatter',x='num_children',y='num_pets',color='red')
[14]:
<AxesSubplot:xlabel='num_children', ylabel='num_pets'>

[15]:
# plot num_children vs num_pets as a scatter plot
[16]:
df.plot(kind='scatter',x='num_children',y='num_pets',color='blue')
[16]:
<AxesSubplot:xlabel='num_children', ylabel='num_pets'>

[17]:
# plot name vs age as bar chart (note we can do this even though
# 'name' is not a number since it's a bar chart)
[18]:
df.plot(kind='bar',x='name',y='age')
[18]:
<AxesSubplot:xlabel='name'>

[19]:
# now make a single plot with two lines
# This:
#
#df.plot(kind='line',x='name',y='num_children',color='blue')
#df.plot(kind='line',x='name',y='num_pets',color='red')
#
# would actually give two separate plots, one above the
# the other. To out both on a single graph, we make use of
# some matplotlib functions
[21]:
# a. clear the graph
plt.clf()
# b. use matplotlib "gca", or "get current axis"
ax = plt.gca()
# c. make two plots, and use the same axes
df.plot(kind='line',x='name',y='num_children',ax=ax)
df.plot(kind='line',x='name',y='num_pets', color='red', ax=ax)
[21]:
<AxesSubplot:xlabel='name'>

[22]:
# we can
[24]:
df.groupby('state')['name'].count().plot(kind='bar')
[24]:
<AxesSubplot:xlabel='state'>

[25]:
df.groupby('state').size().plot(kind='bar')
[25]:
<AxesSubplot:xlabel='state'>

[26]:
df.groupby(['state','gender']).size().unstack().plot(kind='bar',stacked=True)
plt.show()

[29]:
plt.clf()
df.groupby(['gender','state']).size().unstack().plot(kind='bar',stacked=True)
plt.legend(loc='lower right')
plt.gcf().set_size_inches(7,4)
plt.show()
<Figure size 432x288 with 0 Axes>

plot histogram for a dataframe column¶
[30]:
df[['age']].plot(kind='hist',bins=[0,20,40,60,80,100],rwidth=0.9,)
[30]:
<AxesSubplot:ylabel='Frequency'>

percent single¶
[31]:
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
# create dummy variable them group by that
# set the legend to false because we'll fix it later
df.assign(
dummy = 1
).groupby(['dummy','state']).size().groupby(level=0).apply(
lambda x: 100 * x / x.sum()
).to_frame().unstack().plot(kind='bar',stacked=True,legend=False)
plt.title('Amount of records by State, normalized')
# other it'll show up as 'dummy'
plt.xlabel('state')
# disable ticks in the x axis
plt.xticks([])
# fix the legend
current_handles, _ = plt.gca().get_legend_handles_labels()
reversed_handles = reversed(current_handles)
labels = reversed(df['state'].unique())
plt.legend(reversed_handles,labels,loc='lower right')
plt.gcf().set_size_inches(7,4)
plt.gca().yaxis.set_major_formatter(mtick.PercentFormatter())
plt.show()

percent, two-level group by¶
[32]:
import matplotlib.ticker as mtick
df.groupby(['gender','state']).size().groupby(level=0).apply(
lambda x: 100 * x / x.sum()
).unstack().plot(kind='bar',stacked=True,legend='reverse')
plt.gca().yaxis.set_major_formatter(mtick.PercentFormatter())
plt.title('Amount of records by Gender and State, normalized')
# plt.legend(loc='lower right')
plt.gcf().set_size_inches(7,4)
plt.show()

date histograms¶
[33]:
df = pd.DataFrame({
'name':['john','lisa','peter','carl','linda','betty'],
'date_of_birth':[
'01/21/1988','03/10/1977','07/25/1999','01/22/1977','09/30/1968','09/15/1970'
]
})
df
[33]:
name | date_of_birth | |
---|---|---|
0 | john | 01/21/1988 |
1 | lisa | 03/10/1977 |
2 | peter | 07/25/1999 |
3 | carl | 01/22/1977 |
4 | linda | 09/30/1968 |
5 | betty | 09/15/1970 |
[34]:
df['date_of_birth'] = pd.to_datetime(df['date_of_birth'],infer_datetime_format=True)
df
[34]:
name | date_of_birth | |
---|---|---|
0 | john | 1988-01-21 |
1 | lisa | 1977-03-10 |
2 | peter | 1999-07-25 |
3 | carl | 1977-01-22 |
4 | linda | 1968-09-30 |
5 | betty | 1970-09-15 |
[35]:
df.dtypes
[35]:
name object
date_of_birth datetime64[ns]
dtype: object
[36]:
plt.clf()
df['date_of_birth'].map(lambda d: d.month).plot(kind='hist')
plt.xlabel('Month number')
plt.show()

Now try dates¶
[37]:
# import modules
from datetime import datetime
from dateutil.parser import parse
import pandas as pd
[38]:
# Create string variable and convert to datetime format
war_start = '2011-01-03'
datetime.strptime(war_start, '%Y-%m-%d')
[38]:
datetime.datetime(2011, 1, 3, 0, 0)
[39]:
# create a list of string variables and convert to datetime format
attack_dates = ['7/2/2011', '8/6/2012', '11/13/2013', '5/26/2011', '5/2/2001']
[datetime.strptime(x, '%m/%d/%Y') for x in attack_dates]
[39]:
[datetime.datetime(2011, 7, 2, 0, 0),
datetime.datetime(2012, 8, 6, 0, 0),
datetime.datetime(2013, 11, 13, 0, 0),
datetime.datetime(2011, 5, 26, 0, 0),
datetime.datetime(2001, 5, 2, 0, 0)]
[40]:
# Use parse() to attempt to auto-convert common string formats
parse(war_start)
[40]:
datetime.datetime(2011, 1, 3, 0, 0)
[41]:
# Use parse() on every element of the attack_dates string
[parse(x) for x in attack_dates]
[41]:
[datetime.datetime(2011, 7, 2, 0, 0),
datetime.datetime(2012, 8, 6, 0, 0),
datetime.datetime(2013, 11, 13, 0, 0),
datetime.datetime(2011, 5, 26, 0, 0),
datetime.datetime(2001, 5, 2, 0, 0)]
[42]:
# Use parse, but designate that the day is first
parse(war_start, dayfirst=True)
[42]:
datetime.datetime(2011, 3, 1, 0, 0)
[43]:
# Create a dataframe
data = {'date': ['2014-05-01 18:47:05.069722', '2014-05-01 18:47:05.119994', '2014-05-02 18:47:05.178768', '2014-05-02 18:47:05.230071', '2014-05-02 18:47:05.230071', '2014-05-02 18:47:05.280592', '2014-05-03 18:47:05.332662', '2014-05-03 18:47:05.385109', '2014-05-04 18:47:05.436523', '2014-05-04 18:47:05.486877'],
'value': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
df = pd.DataFrame(data, columns = ['date', 'value'])
print(df)
date value
0 2014-05-01 18:47:05.069722 1
1 2014-05-01 18:47:05.119994 1
2 2014-05-02 18:47:05.178768 1
3 2014-05-02 18:47:05.230071 1
4 2014-05-02 18:47:05.230071 1
5 2014-05-02 18:47:05.280592 1
6 2014-05-03 18:47:05.332662 1
7 2014-05-03 18:47:05.385109 1
8 2014-05-04 18:47:05.436523 1
9 2014-05-04 18:47:05.486877 1
[44]:
# Convert df['date'] from string to datetime
pd.to_datetime(df['date'])
[44]:
0 2014-05-01 18:47:05.069722
1 2014-05-01 18:47:05.119994
2 2014-05-02 18:47:05.178768
3 2014-05-02 18:47:05.230071
4 2014-05-02 18:47:05.230071
5 2014-05-02 18:47:05.280592
6 2014-05-03 18:47:05.332662
7 2014-05-03 18:47:05.385109
8 2014-05-04 18:47:05.436523
9 2014-05-04 18:47:05.486877
Name: date, dtype: datetime64[ns]