Reduce a pd.Series

A reduction or aggregation is an operation that takes an entire Series and produces a result that is a single number (or a single string).

import sys
import numpy as np
import pandas as pd

# Five daily prices; the index labels 0-4 represent the day number.
data = [10.0, 20.0, 30.0, 40.0, 50.0]
series = pd.Series(data = data, name = "prices")
series.index.name = "day"   # label the index so it prints with a "day" heading
print(series)
print()

# Reductions: each call collapses the entire Series to a single scalar.
print(f"{series.count() = }")  # number of non-NaN values
print(f"{series.sum() = }")    #also try prod
print(f"{series.mean() = }")
print(f"{series.median() = }")
print()

# Extremes: min/max return values; idxmin/idxmax return index labels,
# while np.argmin/np.argmax return integer positions.
print(f"{series.min() = }")
print(f"{series.max() = }")
print(f"{series.idxmin() = }")         #index of the smallest value
print(f"{series.idxmax() = }")
print(f"{np.argmin(series.array) = }") #position (an integer, like iloc) of the smallest value
print(f"{np.argmax(series.array) = }")
print()

print(series.describe())       #returns a Series containing 8 rows
sys.exit(0)
day
0    10.0
1    20.0
2    30.0
3    40.0
4    50.0
Name: prices, dtype: float64

series.count() = 5
series.sum() = 150.0
series.mean() = 30.0
series.median() = 30.0

series.min() = 10.0
series.max() = 50.0
series.idxmin() = 0
series.idxmax() = 4
np.argmin(series.array) = 0
np.argmax(series.array) = 4

count     5.000000
mean     30.000000
std      15.811388
min      10.000000
25%      20.000000
50%      30.000000
75%      40.000000
max      50.000000
Name: prices, dtype: float64

Does “not a number” count as a row?
Series.size vs. Series.count

import sys
import numpy as np
import pandas as pd

# Both np.nan and None are stored as NaN in a float64 Series.
data = [0.0, 10.0, np.nan, 30.0, None]   #not a number
series = pd.Series(data = data)
print(series)
print()


# len() and .size count every row, including NaN; .count() counts only non-NaN rows.
print(f"{len(series) = }")
print(f"{series.size = }")   #also try series.shape
print(f"{series.count() = }")
print()

print(f"{series.sum() = }")  #ignores np.nan
print(f"{series.sum(skipna = False) = }")  # with skipna off, any NaN makes the whole sum NaN
sys.exit(0)
0     0.0
1    10.0
2     NaN
3    30.0
4     NaN
dtype: float64

len(series) = 5
series.size = 5
series.count() = 3

series.sum() = 40.0
series.sum(skipna = False) = nan

Covariance and correlation

"Covariance and correlation."

import sys
import math
import pandas as pd

#data about N stores
N = 7

data = [
    [ 2,  5,  1,  3,  4,  1,  5],   #number of commercials for each store
    [24, 28, 22, 26, 25, 24, 26]    #sales volume in hundreds of dollars
]

series0 = pd.Series(data = data[0], name = "Number of Commercials")
series1 = pd.Series(data = data[1], name = "Sales Volume in Hundreds")

print("sample variance:")
print(series0.var())
print(((series0 - series0.mean()) ** 2).sum() / (N - 1))
print()

print("sample standard deviation:")
print(series0.std())
print(math.sqrt(series0.var()))
print()

print("sample covariance:")
print(series0.cov(series1))
difference0 = series0 - series0.mean()
difference1 = series1 - series1.mean()
print((difference0 * difference1).sum() / (N - 1))
print()

print("sample correlation coefficient:")
print(series0.corr(series1))
print(series0.cov(series1) / (series0.std() * series1.std()))

sys.exit(0)
sample variance:
3.0
3.0

sample standard deviation:
1.7320508075688772
1.7320508075688772

sample covariance:
2.833333333333333
2.8333333333333335

sample correlation coefficient:
0.8542821429703302
0.8542821429703302

Scatter plot

"Scatter plot of a pair of pd.Serieses."

import pandas as pd
import matplotlib.pyplot as plt

#data about 7 stores

data = [
    [ 2,  5,  1,  3,  4,  1,  5],   #number of commercials for each store
    [24, 28, 22, 26, 25, 24, 26]    #sales volume in hundreds of dollars
]

series0 = pd.Series(data = data[0], name = "commercials")
series1 = pd.Series(data = data[1], name = "volume")

df = pd.concat([series0, series1], axis = 1) #Create a pd.DataFrame containing 2 columns.
print(df)

axes = df.plot.scatter(
    x = "commercials",
    y = "volume",
    figsize = [6.4, 4.8], #DataFrame.plot.scatter creates a new Figure.
    color = "#1f77b4",    #red, green, blue
    grid = False,
    marker = "o",         #style of marker; also try "s"
    s = 25                #size of marker in points
)

figure = plt.gcf()
figure.canvas.set_window_title("matplotlib DataFrame.plot.scatter")
axes.set_title("Scatter Plot")
axes.set_xlabel("number of commercials")
axes.set_ylabel("sales volume in hundreds of dollars")

plt.show()   #infinite loop
   commercials  volume
0            2      24
1            5      28
2            1      22
3            3      26
4            4      25
5            1      24
6            5      26

A time series

"Create a Series with a pd.DatetimeIndex.  Compute the percent change in each row."

import sys
import pandas as pd

start = pd.Timestamp(year = 2020, month = 12, day = 25) #or start = pd.Timestamp("2020-12-25")
end   = pd.Timestamp(year = 2020, month = 12, day = 31)

index = pd.date_range(start = start, end = end, freq = "1D", name = "date")
#or index = pd.date_range("2020-12-25", "2020-12-31", name = "date")

data = [25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0]
series = pd.Series(data = data, index = index, name = "Prices")
s = series.to_string(dtype = True, float_format = lambda price: f"${price:.2f}", length = True, name = True)
print(s)
print()

#Examine the index in greater detail.
print(f"{series.index = }")
print()
print(f"{type(series.index) = }")
print(f"{series.index.dtype.name = }")
print(f"{series.index.freqstr = }")
print()

seriesOfChanges = series.pct_change()
seriesOfChanges.name = "Percent Change"
s = seriesOfChanges.to_string(dtype = True, float_format = lambda change: f"{change:.4f} %", length = True, name = True)
print(s)

sys.exit(0)
date
2020-12-25   $25.00
2020-12-26   $26.00
2020-12-27   $27.00
2020-12-28   $28.00
2020-12-29   $29.00
2020-12-30   $30.00
2020-12-31   $31.00
Freq: D, Name: Prices, Length: 7, dtype: float64

series.index = DatetimeIndex(['2020-12-25', '2020-12-26', '2020-12-27', '2020-12-28',
               '2020-12-29', '2020-12-30', '2020-12-31'],
              dtype='datetime64[ns]', name='date', freq='D')

type(series.index) = <class 'pandas.core.indexes.datetimes.DatetimeIndex'>
series.index.dtype.name = 'datetime64[ns]'
series.index.freqstr = 'D'

date
2020-12-25      nan %
2020-12-26   0.0400 %
2020-12-27   0.0385 %
2020-12-28   0.0370 %
2020-12-29   0.0357 %
2020-12-30   0.0345 %
2020-12-31   0.0333 %
Freq: D, Name: Percent Change, Length: 7, dtype: float64

Covariance and correlation between two time series

pip3 install pandas-datareader

pip3 show pandas-datareader
Name: pandas-datareader
Version: 0.8.1
Summary: Data readers extracted from the pandas codebase, should be compatible with recent pandas versions
Home-page: https://github.com/pydata/pandas-datareader
Author: The PyData Development Team
Author-email: pydata@googlegroups.com
License: BSD License
Location: /Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages
Requires: lxml, pandas, requests
Required-by:
"""
How closely correlated is Apple with Google?
closeAAPL, closeGOOG, changeAAPL, changeGOOG are Serieses.
"""

import sys
import math
import pandas as pd
import pandas_datareader

df = pandas_datareader.data.get_data_yahoo(symbols = "AAPL") #df is a pd.DataFrame.
closeAAPL = df["Adj Close"]   #Get the Adjusted Close column of the DataFrame.
closeAAPL.name = "AAPL Adj Close"

df = pandas_datareader.data.get_data_yahoo(symbols = "GOOG")
closeGOOG = df["Adj Close"]
closeGOOG.name = "GOOG Adj Close"

pd.set_option("max_rows", 6)
print(closeAAPL)
print()
print(closeGOOG)
print()

changeAAPL = closeAAPL.pct_change()   #percent change
changeGOOG = closeGOOG.pct_change()
changeAAPL.name = "Percent Change AAPL"
changeGOOG.name = "Percent Change GOOG"

print(changeAAPL)
print()
print(changeGOOG)
print()

print(f"{changeAAPL.corr(changeGOOG) = }")
print(f"{changeAAPL.cov(changeGOOG)  = }")
sys.exit(0)

Unfortunately the first row of the output of Series.pct_change is always np.nan, because there is no previous row to compare the first row with.

Date
2014-12-12    100.821831
2014-12-15     99.443619
2014-12-16     98.083740
                 ...
2019-12-09    266.920013
2019-12-10    268.480011
2019-12-11    268.850006
Name: AAPL Adj Close, Length: 1258, dtype: float64

Date
2014-12-12     517.239929
2014-12-15     512.393250
2014-12-16     494.033630
                 ...
2019-12-09    1343.560059
2019-12-10    1344.660034
2019-12-11    1345.785034
Name: GOOG Adj Close, Length: 1258, dtype: float64

Date
2014-12-12         NaN
2014-12-15   -0.013670
2014-12-16   -0.013675
                ...
2019-12-09   -0.014000
2019-12-10    0.005844
2019-12-11    0.001378
Name: Percent Change AAPL, Length: 1258, dtype: float64

Date
2014-12-12         NaN
2014-12-15   -0.009370
2014-12-16   -0.035831
                ...
2019-12-09    0.002193
2019-12-10    0.000819
2019-12-11    0.000837
Name: Percent Change GOOG, Length: 1258, dtype: float64

changeAAPL.corr(changeGOOG) = 0.5233805904933649
changeAAPL.cov(changeGOOG)  = 0.00012464409418096498

Here’s how I found out that there is a "Volume" column in addition to the "Adj Close" column.

df = pandas_datareader.data.get_data_yahoo("AAPL")
print(f"{df.columns = }")
df.columns = Index(['High', 'Low', 'Open', 'Close', 'Volume', 'Adj Close'], dtype='object')
Is there any correlation between the volume and the adjusted close?