2017 Alexander C.S. Hendorf, Königsweg GmbH, Mannheim
%run '2 helpers.py'
%config InlineBackend.figure_format = 'retina';
import numpy as np
import pandas as pd
import random
import seaborn as sns
%matplotlib inline
Series = Column
Series do have a data tpye.
series = pd.Series([3, 62, 75, 83, 47, 43, 39, 16, 19, 2])
series
pd.Series([3, 62, 75, 83, 47, 43, 39, 16, 19, 2, 2.66676])
pd.Series([3, 62, 75, 83, 47, 43, 39, 16, 19, 2, "2"])
# Falle: int -> Float bedingt durch Implementierung in NumPy.
pd.Series([3, None, 6])
series[0]
series[3:6]
# series[3:6]
series.iloc[3:6]
# set alpha label as new index for the series
series.index = [x for x in "ABCDEFGHIJKLMNOPQRSTUVWXYZ"][:len(series)]
series
series['C']
series.loc['C']
series['D':'F']
# by label: slice includes end!
series[2]
# position, pythonic
# set alpha label as new index for the series
series.index = [x for x in "GATTACAXYZ"][:len(series)]
series
series.loc['G']
try:
series.loc['G':'A']
except Exception as e:
print("ERROR:", e)
# non-unique values breaks slicing
series.loc['X':'Z']
# while unique values are still slicable in a non-unique index
series.index.is_monotonic
series.index.is_unique
series.index = range(series.size)
series
series
series.index.is_unique
series.index.is_monotonic
series.index.is_monotonic_increasing
df = pd.read_json('data/sampledf.json')
df
DataFrame[n] selektiert die Spalte an der Position n
# visualisation of below - for presentation
display_html(df.style.apply(highlight, subset=pd.IndexSlice[:, 2]))
df[2]
Das Slice DataFrame[start:stop] selektiert die Zeilen von start bis stop (stop ist exklusive!)
# visualisation of below - for presentation
display_html(df.style.apply(highlight, subset=pd.IndexSlice[range(2, 4), :]))
# column
df[2:4]
# visualisation of below - for presentation
display_html(df.style.apply(highlight, subset=pd.IndexSlice[range(2, 4), range(2, 4)]))
# segment
df.iloc[2:4, 2:4]
# visualisation of below - for presentation
display_html(df.style.apply(highlight, subset=pd.IndexSlice[:, range(2, 4)]))
# column slice
df.iloc[:, 2:4]
df
df.index = ["R{:02d}".format(i) for i in range(len(df))]
df.columns = ["C{:02d}".format(i) for i in range(len(df.columns))]
df
# visualisation of below - for presentation
display_html(df.style.apply(highlight, subset=pd.IndexSlice[:, 'C05']))
df['C05']
# visualisation of below - for presentation
display_html(df.style.apply(highlight, subset=pd.IndexSlice['R02':'R05', :]))
df['R02':'R05']
# visualisation of below - for presentation
display_html(df.style.apply(highlight, subset=pd.IndexSlice[:, 'C02':'C05']))
df.loc[:, 'C02':'C05']
# visualisation of below - for presentation
display_html(df.style.apply(highlight, subset=pd.IndexSlice['R02':'R05', 'C04':'C05']))
# segment
df.loc['R02':'R05', 'C04':'C05']
sales_data = pd.read_excel('data/blooth_sales_data_clean.xlsx')
sales_data.head(5)
sales_data.info()
Ein Boolean Index (oder mask) ist ein Array mit True/False Werten: [1, 0, 1, 1, 0, 0, 1, …]
sales_data['units'] > 40
sales_data[
sales_data['units'] > 40
]
sales_data[
(sales_data['units'] > 40) & (sales_data['unitprice'] < 10) # AND
]
sales_data[
(sales_data['units'] > 40) | (sales_data['unitprice'] < 10) # OR
]
somany = 20
dfm = pd.DataFrame(
{'city':[x for x in ['Paris', 'London', 'Berlin', 'New York', 'Mannheim']*10][:20],
'category': [random.randint(1, 3) for x in range(somany)],
'price': [random.randint(150, 300) for x in range(somany)],
'rating': [random.randint(0, 10) for x in range(somany)]})
dfm['country'] = dfm['city'].map(
{'Paris': 'FR', 'London': 'GB', 'Berlin': 'DE', 'New York': 'US', 'Mannheim': 'DE'})
dfm
dfg = dfm.groupby(['country', 'city', 'category']).mean()
dfg
dfg.index
dfg.index.levels
dfg.index.names
pd.set_option('display.multi_sparse', False) # Tipp
dfg
pd.set_option('display.multi_sparse', True)
dfg.index.get_level_values(2)
dfg.index.get_level_values(1)
dfg.loc['DE']
dfg.loc[('DE', 'Mannheim')]
dfg.loc[('DE', 'Mannheim')].max() # max of column
dfg.loc[('DE', 'Mannheim', 3)]
type(dfg.loc[('DE', 'Mannheim', 3)]) # know the type you're working with
dfg.unstack()
dfg.unstack().unstack()
df = pd.read_csv('data/tempm.csv', header=None)
df.columns = ['timestamp', 'celsius']
df.head(5)
df[:100].plot()
df.info()
df.index
df.index = pd.to_datetime(df['timestamp'])
Easy way to work with timeseries
df.head()
df.index
df.index.is_monotonic
df.index.is_unique
df[:100].plot()
df['celsius'].groupby([df.index.year, df.index.week]).mean().plot.bar();
df['celsius'].groupby([df.index.year, df.index.dayofyear]).mean().plot();
df['celsius'].groupby([df.index.year, df.index.dayofyear]).agg([min, max]).plot();
df['weekday'] = df.index.weekday
df['weekend'] = df['weekday'].isin({5, 6})
df.groupby(['weekend', df.index.hour])['celsius'].mean().unstack(level=0).plot()
# selecting ranges
df["2014-08-27":"2014-08-28"]['celsius'].plot()
df[(df.index.hour > 12) & (df.index.hour <=16)]['celsius'].plot()
df['celsius'].resample('D').max().head()
df['celsius'].resample('M').mean().head()
df['celsius'].resample('3D').mean().plot();
df['celsius'].resample('D').agg(['min', 'max']).plot();
sales_data.head()
sales_data.info()
sales_data.memory_usage()
m1 = sales_data.memory_usage(deep=True)
m1
sales_data = sales_data.astype({'customer': 'category', 'product': 'category', 'name': 'category'})
m2 = sales_data.memory_usage(deep=True)
m2
m2/m1
m1.sum()-m2.sum(), m2.sum()/m1.sum()
sales_data.index = sales_data['product']
sales_data.head()
favorites = ['Harry Potter book', 'Rubik’s Cube', 'Corolla', 'Rubik’s Cube', 'iPhone', 'iPad',
'PlayStation', 'Star Wars', 'Thriller record',
'banana', 'Lipitor']
pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c'])
a = pd.Categorical(values=['bad', 'neutral', 'nice', 'good', 'excellent'],
categories=['bad', 'neutral', 'nice', 'good', 'excellent'],
ordered=True)
a.max()
a.min()