A
reduction
or
aggregation
is an operation that takes an entire
Series
and produces a result that is a single number
(or a single string).
import sys
import numpy as np
import pandas as pd

#Five sample prices; the index is labeled so each row reads as a day number.
prices = [10.0, 20.0, 30.0, 40.0, 50.0]
series = pd.Series(prices, name = "prices")
series.index.name = "day"
print(series)
print()

#Reductions: each call collapses the entire Series into a single number.
print(f"{series.count() = }")
print(f"{series.sum() = }")   #also try prod
print(f"{series.mean() = }")
print(f"{series.median() = }")
print()

#Extremes, and where they occur: idxmin/idxmax report index labels,
#np.argmin/np.argmax report integer positions (like iloc).
print(f"{series.min() = }")
print(f"{series.max() = }")
print(f"{series.idxmin() = }")   #index of the smallest value
print(f"{series.idxmax() = }")
print(f"{np.argmin(series.array) = }")   #position (an integer, like iloc) of the smallest value
print(f"{np.argmax(series.array) = }")
print()

print(series.describe())   #returns a Series containing 8 rows
sys.exit(0)
day 0 10.0 1 20.0 2 30.0 3 40.0 4 50.0 Name: prices, dtype: float64 series.count() = 5 series.sum() = 150.0 series.mean() = 30.0 series.median() = 30.0 series.min() = 10.0 series.max() = 50.0 series.idxmin() = 0 series.idxmax() = 4 np.argmin(series.array) = 0 np.argmax(series.array) = 4 count 5.000000 mean 30.000000 std 15.811388 min 10.000000 25% 20.000000 50% 30.000000 75% 40.000000 max 50.000000 Name: prices, dtype: float64
import sys
import numpy as np
import pandas as pd

#Two of the five cells are missing; both np.nan and None become NaN in a float Series.
values = [0.0, 10.0, np.nan, 30.0, None]   #not a number
series = pd.Series(values)
print(series)
print()

#len and size count every cell, including the NaNs; count() skips them.
print(f"{len(series) = }")
print(f"{series.size = }")   #also try series.shape
print(f"{series.count() = }")
print()

#By default reductions skip NaN; with skipna = False a single NaN poisons the result.
print(f"{series.sum() = }")   #ignores np.nan
print(f"{series.sum(skipna = False) = }")
sys.exit(0)
0 0.0 1 10.0 2 NaN 3 30.0 4 NaN dtype: float64 len(series) = 5 series.size = 5 series.count() = 3 series.sum() = 40.0 series.sum(skipna = False) = nan
"Covariance and correlation."
import sys
import math
import pandas as pd

#data about N stores
data = [
    [ 2,  5,  1,  3,  4,  1,  5],   #number of commercials for each store
    [24, 28, 22, 26, 25, 24, 26]    #sales volume in hundreds of dollars
]
N = len(data[0])   #number of stores, derived from the data instead of hard-coded
series0 = pd.Series(data = data[0], name = "Number of Commercials")
series1 = pd.Series(data = data[1], name = "Sales Volume in Hundreds")

#Each pandas statistic is followed by the same computation done by hand,
#to show that pandas uses the sample (N - 1 denominator) formulas.
print("sample variance:")
print(series0.var())
print(((series0 - series0.mean()) ** 2).sum() / (N - 1))
print()

print("sample standard deviation:")
print(series0.std())
print(math.sqrt(series0.var()))
print()

print("sample covariance:")
print(series0.cov(series1))
difference0 = series0 - series0.mean()
difference1 = series1 - series1.mean()
print((difference0 * difference1).sum() / (N - 1))
print()

print("sample correlation coefficient:")
print(series0.corr(series1))
print(series0.cov(series1) / (series0.std() * series1.std()))
sys.exit(0)
sample variance: 3.0 3.0 sample standard deviation: 1.7320508075688772 1.7320508075688772 sample covariance: 2.833333333333333 2.8333333333333335 sample correlation coefficient: 0.8542821429703302 0.8542821429703302
"Scatter plot of a pair of pd.Serieses."
import pandas as pd
import matplotlib.pyplot as plt

#data about 7 stores
data = [
    [ 2,  5,  1,  3,  4,  1,  5],   #number of commercials for each store
    [24, 28, 22, 26, 25, 24, 26]    #sales volume in hundreds of dollars
]
series0 = pd.Series(data = data[0], name = "commercials")
series1 = pd.Series(data = data[1], name = "volume")

df = pd.concat([series0, series1], axis = 1)   #Create a pd.DataFrame containing 2 columns.
print(df)

axes = df.plot.scatter(
    x = "commercials",
    y = "volume",
    figsize = [6.4, 4.8],   #DataFrame.plot.scatter creates a new Figure.
    color = "#1f77b4",      #red, green, blue
    grid = False,
    marker = "o",           #style of marker; also try "s"
    s = 25                  #size of marker in points
)

figure = plt.gcf()
#canvas.set_window_title was deprecated in matplotlib 3.4 and removed in 3.6;
#the FigureManager method below works on old and new matplotlib versions alike.
figure.canvas.manager.set_window_title("matplotlib DataFrame.plot.scatter")
axes.set_title("Scatter Plot")
axes.set_xlabel("number of commercials")
axes.set_ylabel("sales volume in hundreds of dollars")
plt.show()   #infinite loop
commercials volume 0 2 24 1 5 28 2 1 22 3 3 26 4 4 25 5 1 24 6 5 26
"Create a Series with a pd.DatetimeIndex. Compute the percent change in each row."
import sys
import pandas as pd

#Seven consecutive calendar days, one price per day.
start = pd.Timestamp(year = 2020, month = 12, day = 25)   #or start = pd.Timestamp("2020-12-25")
end = pd.Timestamp(year = 2020, month = 12, day = 31)
dates = pd.date_range(start = start, end = end, freq = "1D", name = "date")
#or dates = pd.date_range("2020-12-25", "2020-12-31", name = "date")

prices = [25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0]
series = pd.Series(data = prices, index = dates, name = "Prices")

text = series.to_string(
    dtype = True,
    float_format = lambda price: f"${price:.2f}",
    length = True,
    name = True
)
print(text)
print()

#Examine the index in greater detail.
print(f"{series.index = }")
print()
print(f"{type(series.index) = }")
print(f"{series.index.dtype.name = }")
print(f"{series.index.freqstr = }")
print()

#Fractional change of each row relative to the previous row;
#the first row has no predecessor, so its change is NaN.
changes = series.pct_change()
changes.name = "Percent Change"
text = changes.to_string(
    dtype = True,
    float_format = lambda change: f"{change:.4f} %",
    length = True,
    name = True
)
print(text)
sys.exit(0)
date 2020-12-25 $25.00 2020-12-26 $26.00 2020-12-27 $27.00 2020-12-28 $28.00 2020-12-29 $29.00 2020-12-30 $30.00 2020-12-31 $31.00 Freq: D, Name: Prices, Length: 7, dtype: float64 series.index = DatetimeIndex(['2020-12-25', '2020-12-26', '2020-12-27', '2020-12-28', '2020-12-29', '2020-12-30', '2020-12-31'], dtype='datetime64[ns]', name='date', freq='D') type(series.index) = <class 'pandas.core.indexes.datetimes.DatetimeIndex'> series.index.dtype.name = 'datetime64[ns]' series.index.freqstr = 'D' date 2020-12-25 nan % 2020-12-26 0.0400 % 2020-12-27 0.0385 % 2020-12-28 0.0370 % 2020-12-29 0.0357 % 2020-12-30 0.0345 % 2020-12-31 0.0333 % Freq: D, Name: Percent Change, Length: 7, dtype: float64
pip3 install pandas-datareader pip3 show pandas-datareader Name: pandas-datareader Version: 0.8.1 Summary: Data readers extracted from the pandas codebase,should be compatible with recent pandas versions Home-page: https://github.com/pydata/pandas-datareader Author: The PyData Development Team Author-email: pydata@googlegroups.com License: BSD License Location: /Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages Requires: lxml, pandas, requests Required-by:
"""
How closely correlated is Apple with Google?
closeAAPL, closeGOOG, changeAAPL, changeGOOG are Serieses.
"""
import sys
import pandas as pd
import pandas_datareader

df = pandas_datareader.data.get_data_yahoo(symbols = "AAPL")   #df is a pd.DataFrame.
closeAAPL = df["Adj Close"]   #Get the Adjusted Close column of the DataFrame.
closeAAPL.name = "AAPL Adj Close"

df = pandas_datareader.data.get_data_yahoo(symbols = "GOOG")
closeGOOG = df["Adj Close"]
closeGOOG.name = "GOOG Adj Close"

#Use the full option name: the abbreviation "max_rows" is ambiguous
#(raises OptionError) in newer pandas versions because it matches
#more than one registered option.
pd.set_option("display.max_rows", 6)
print(closeAAPL)
print()
print(closeGOOG)
print()

changeAAPL = closeAAPL.pct_change()   #percent change
changeGOOG = closeGOOG.pct_change()
changeAAPL.name = "Percent Change AAPL"
changeGOOG.name = "Percent Change GOOG"
print(changeAAPL)
print()
print(changeGOOG)
print()

print(f"{changeAAPL.corr(changeGOOG) = }")
print(f"{changeAAPL.cov(changeGOOG) = }")
sys.exit(0)
Unfortunately
the first row of the output of
Series.pct_change
is always
np.nan
,
because there is no previous row to compare the first row with.
Date 2014-12-12 100.821831 2014-12-15 99.443619 2014-12-16 98.083740 ... 2019-12-09 266.920013 2019-12-10 268.480011 2019-12-11 268.850006 Name: AAPL Adj Close, Length: 1258, dtype: float64 Date 2014-12-12 517.239929 2014-12-15 512.393250 2014-12-16 494.033630 ... 2019-12-09 1343.560059 2019-12-10 1344.660034 2019-12-11 1345.785034 Name: GOOG Adj Close, Length: 1258, dtype: float64 Date 2014-12-12 NaN 2014-12-15 -0.013670 2014-12-16 -0.013675 ... 2019-12-09 -0.014000 2019-12-10 0.005844 2019-12-11 0.001378 Name: Percent Change AAPL, Length: 1258, dtype: float64 Date 2014-12-12 NaN 2014-12-15 -0.009370 2014-12-16 -0.035831 ... 2019-12-09 0.002193 2019-12-10 0.000819 2019-12-11 0.000837 Name: Percent Change GOOG, Length: 1258, dtype: float64 changeAAPL.corr(changeGOOG) = 0.5233805904933649 changeAAPL.cov(changeGOOG) = 0.00012464409418096498
Here’s how I found out that there is a
"Volume"
column in addition to the
"Adj Close"
column.
#Download the AAPL daily price history; df is a pd.DataFrame.
df = pandas_datareader.data.get_data_yahoo("AAPL")
#The "=" specifier in the f-string prints the expression text along with its value.
print(f"{df.columns = }")
df.columns = Index(['High', 'Low', 'Open', 'Close', 'Volume', 'Adj Close'], dtype='object')

Is there any correlation between the volume and the adjusted close?