A
pd.DataFrame
object contains one or more columns of information.
Each column is a
pd.Series
,
and all the
pd.Series
es
share the same index.
If you don’t specify an index,
the index of a
pd.DataFrame
,
like the index of a
pd.Series
,
defaults to a
RangeIndex
.
In addition to the familiar vertical index
that provides an identifying number or name for each row,
a
pd.DataFrame
has a horizontal index
that provides an identifying number or name for each column.
"Create a pd.DataFrame object. Examine its two default indices, vertical and horizontal." import sys import pandas as pd data = [ #three rows and four columns [ 0.0, 1.0, 2.0, 3.0], [10.0, 11.0, 12.0, 13.0], [20.0, 21.0, 22.0, 23.0] ] df = pd.DataFrame(data = data) print(df) #means print(df.to_string()) print() print(f"{df.index = }") print(f"{df.columns = }") print() print(df.dtypes) #a pd.Series giving the dtype of each column print() df.index = pd.Index(data = ["row0", "row1", "row2"], name = "rows") df.columns = pd.Index(data = ["col0", "col1", "col2", "col3"], name = "cols") print(df) sys.exit(0)
0 1 2 3 0 0.0 1.0 2.0 3.0 1 10.0 11.0 12.0 13.0 2 20.0 21.0 22.0 23.0 df.index = RangeIndex(start=0, stop=3, step=1) df.columns = RangeIndex(start=0, stop=4, step=1) 0 float64 1 float64 2 float64 3 float64 dtype: object cols col0 col1 col2 col3 rows row0 0.0 1.0 2.0 3.0 row1 10.0 11.0 12.0 13.0 row2 20.0 21.0 22.0 23.0
"Create a pd.DataFrame object. Examine its two default indices, vertical and horizontal." import sys import pandas as pd data = [ #three rows and four columns [ 0.0, 1.0, 2.0, 3.0], [10.0, 11.0, 12.0, 13.0], [20.0, 21.0, 22.0, 23.0] ] df = pd.DataFrame(data = data) df.index = pd.Index(data = ["row0", "row1", "row2"], name = "rows") df.columns = pd.Index(data = ["col0", "col1", "col2", "col3"], name = "cols") print(df) print() series = df["col2"] #Get one of the columns. print(series) print() series = df.loc["row2"] #Get one of the rows. Could also say df.iloc[2] print(series) #A pd.Series always prints vertically, even if originally a row. print() print(f'{df.at["row2", "col2"] = }') #Get an individual value. print(f'{df.iat[2, 2] = }') sys.exit(0)
cols col0 col1 col2 col3 rows row0 0.0 1.0 2.0 3.0 row1 10.0 11.0 12.0 13.0 row2 20.0 21.0 22.0 23.0 rows row0 2.0 row1 12.0 row2 22.0 Name: col2, dtype: float64 cols col0 20.0 col1 21.0 col2 22.0 col3 23.0 Name: row2, dtype: float64 df.at["row2", "col2"] = 22.0 df.iat[2, 2] = 22.0
data
to one of the following.
The first one does the work in Python.
The second one is faster because it ultimately does the work in C.
nrows = 3   # number of rows
ncols = 4   # number of columns

# Python list comprehension.
# data is a nrows by ncols list; each row starts at the next multiple of ten.
data = [
    [float(tens + ones) for ones in range(ncols)]
    for tens in range(0, 10 * nrows, 10)
]
# Remember to import numpy as np.
nrows = 3   # number of rows
ncols = 4   # number of columns

ones = np.arange(ncols, dtype=np.float64)
tens = np.arange(0, 10 * nrows, 10, dtype=np.float64)

# Create three np.ndarrays. Each is nrows by ncols:
# meshgrid expands ones and tens, and their sum is data.
ones, tens = np.meshgrid(ones, tens)
data = ones + tens

# Just to help you understand.
for array in (ones, tens, data):
    print(array)
    print()
[[0. 1. 2. 3.] [0. 1. 2. 3.] [0. 1. 2. 3.]] [[ 0. 0. 0. 0.] [10. 10. 10. 10.] [20. 20. 20. 20.]] [[ 0. 1. 2. 3.] [10. 11. 12. 13.] [20. 21. 22. 23.]]
You can also create
data
this way:
nrows = 3   # number of rows
ncols = 4   # number of columns

units = np.arange(ncols, dtype=np.float64)
decades = np.arange(0, 10 * nrows, 10, dtype=np.float64)

# grids holds the two nrows by ncols np.ndarrays returned by meshgrid.
grids = np.meshgrid(units, decades)
data = sum(grids)
# or even this way:
nrows = 3   # number of rows
ncols = 4   # number of columns

units = np.arange(ncols, dtype=np.float64)
decades = np.arange(0, 10 * nrows, 10, dtype=np.float64)

# Add the two meshgrid arrays together without naming the intermediate result.
data = sum(np.meshgrid(units, decades))
pd.DataFrame
that is wider than
pd.options.display.max_columns
.
See
Options
and settings.
The
option_context
function creates and returns a Python
context
manager.
See
Context
Manager Types.
"Print a pd.DataFrame that is wider than pd.options.display.max_columns." import sys import numpy as np import pandas as pd print(f'{pd.get_option("display.max_columns") = }') print(f'{pd.get_option("display.width") = }') print() nrows = 10 ncols = 16 ones = np.arange(ncols) hundreds = np.arange(0, 100 * nrows, 100) ones, hundreds = np.meshgrid(ones, hundreds) #ones is a nrows by ncols np.ndarray. So is hundreds. data = ones + hundreds #data is a nrows by ncols np.ndarray. df = pd.DataFrame(data = data) df.index = pd.RangeIndex(0, 100 * nrows, 100) print(df) print(80 * "-") print() with pd.option_context("display.max_columns", None): print(df) print(80 * "-") print() with pd.option_context("display.max_columns", 16, "display.width", None): print(df) sys.exit(0)
pd.get_option("display.max_columns") = 0 pd.get_option("display.width") = 80 0 1 2 3 4 5 6 ... 9 10 11 12 13 14 15 0 0 1 2 3 4 5 6 ... 9 10 11 12 13 14 15 100 100 101 102 103 104 105 106 ... 109 110 111 112 113 114 115 200 200 201 202 203 204 205 206 ... 209 210 211 212 213 214 215 300 300 301 302 303 304 305 306 ... 309 310 311 312 313 314 315 400 400 401 402 403 404 405 406 ... 409 410 411 412 413 414 415 500 500 501 502 503 504 505 506 ... 509 510 511 512 513 514 515 600 600 601 602 603 604 605 606 ... 609 610 611 612 613 614 615 700 700 701 702 703 704 705 706 ... 709 710 711 712 713 714 715 800 800 801 802 803 804 805 806 ... 809 810 811 812 813 814 815 900 900 901 902 903 904 905 906 ... 909 910 911 912 913 914 915 [10 rows x 16 columns] -------------------------------------------------------------------------------- 0 1 2 3 4 5 6 7 8 9 10 11 12 13 \ 0 0 1 2 3 4 5 6 7 8 9 10 11 12 13 100 100 101 102 103 104 105 106 107 108 109 110 111 112 113 200 200 201 202 203 204 205 206 207 208 209 210 211 212 213 300 300 301 302 303 304 305 306 307 308 309 310 311 312 313 400 400 401 402 403 404 405 406 407 408 409 410 411 412 413 500 500 501 502 503 504 505 506 507 508 509 510 511 512 513 600 600 601 602 603 604 605 606 607 608 609 610 611 612 613 700 700 701 702 703 704 705 706 707 708 709 710 711 712 713 800 800 801 802 803 804 805 806 807 808 809 810 811 812 813 900 900 901 902 903 904 905 906 907 908 909 910 911 912 913 14 15 0 14 15 100 114 115 200 214 215 300 314 315 400 414 415 500 514 515 600 614 615 700 714 715 800 814 815 900 914 915 -------------------------------------------------------------------------------- 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 0 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 100 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 200 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 300 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 400 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 500 
500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 600 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 700 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 800 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 900 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915
pd.DataFrame
that contains
only the first five rows of the original
pd.DataFrame
.
"Create a new pd.DataFrame containing only the first 5 rows of the original pd.DataFrame." import sys import numpy as np import pandas as pd nrows = 7 #number of rows ncols = 6 #number of columns ones = np.arange(ncols) tens = np.arange(0, 10 * nrows, 10) ones, tens = np.meshgrid(ones, tens) data = ones + tens df = pd.DataFrame(data = data) print(df) print() print(df.head()) sys.exit(0)
0 1 2 3 4 5 0 0 1 2 3 4 5 1 10 11 12 13 14 15 2 20 21 22 23 24 25 3 30 31 32 33 34 35 4 40 41 42 43 44 45 5 50 51 52 53 54 55 6 60 61 62 63 64 65 0 1 2 3 4 5 0 0 1 2 3 4 5 1 10 11 12 13 14 15 2 20 21 22 23 24 25 3 30 31 32 33 34 35 4 40 41 42 43 44 45
Now create a new
pd.DataFrame
that contains
only the first five columns of the original
pd.DataFrame
.
"Create a new pd.DataFrame containing only the first 5 columns of the original pd.DataFrame." import sys import numpy as np import pandas as pd nrows = 7 #number of rows ncols = 6 #number of columns ones = np.arange(ncols) tens = np.arange(0, 10 * nrows, 10) ones, tens = np.meshgrid(ones, tens) data = ones + tens df = pd.DataFrame(data = data) print(df) print() print(df.T) #uppercase T for transpose print() print(df.T.head()) print() print(df.T.head().T) sys.exit(0)
0 1 2 3 4 5 0 0 1 2 3 4 5 1 10 11 12 13 14 15 2 20 21 22 23 24 25 3 30 31 32 33 34 35 4 40 41 42 43 44 45 5 50 51 52 53 54 55 6 60 61 62 63 64 65 0 1 2 3 4 5 6 0 0 10 20 30 40 50 60 1 1 11 21 31 41 51 61 2 2 12 22 32 42 52 62 3 3 13 23 33 43 53 63 4 4 14 24 34 44 54 64 5 5 15 25 35 45 55 65 0 1 2 3 4 5 6 0 0 10 20 30 40 50 60 1 1 11 21 31 41 51 61 2 2 12 22 32 42 52 62 3 3 13 23 33 43 53 63 4 4 14 24 34 44 54 64 0 1 2 3 4 0 0 1 2 3 4 1 10 11 12 13 14 2 20 21 22 23 24 3 30 31 32 33 34 4 40 41 42 43 44 5 50 51 52 53 54 6 60 61 62 63 64
Make sure that
tail
works too.
pd.DataFrame
.
"Describe each column of a pd.DataFrame." import sys import numpy as np import pandas as pd nrows = 10 ncols = 5 ones = np.arange(ncols) tens = np.arange(0, 10 * nrows, 10) ones, tens = np.meshgrid(ones, tens) data = ones + tens df = pd.DataFrame(data = data) df.index = pd.RangeIndex(0, 10 * nrows, 10) print(df) print() print(df.describe()) sys.exit(0)
0 1 2 3 4 0 0 1 2 3 4 10 10 11 12 13 14 20 20 21 22 23 24 30 30 31 32 33 34 40 40 41 42 43 44 50 50 51 52 53 54 60 60 61 62 63 64 70 70 71 72 73 74 80 80 81 82 83 84 90 90 91 92 93 94 0 1 2 3 4 count 10.000000 10.000000 10.000000 10.000000 10.000000 mean 45.000000 46.000000 47.000000 48.000000 49.000000 std 30.276504 30.276504 30.276504 30.276504 30.276504 min 0.000000 1.000000 2.000000 3.000000 4.000000 25% 22.500000 23.500000 24.500000 25.500000 26.500000 50% 45.000000 46.000000 47.000000 48.000000 49.000000 75% 67.500000 68.500000 69.500000 70.500000 71.500000 max 90.000000 91.000000 92.000000 93.000000 94.000000
Now describe each of the ten rows.
# Transpose (uppercase T) so that each original row becomes a column.
transposed = df.T
print(transposed)
print()

# describe works column by column, so this summarizes the original rows.
row_summary = transposed.describe()
with pd.option_context("display.width", None):   # no limit on the line width
    print(row_summary)
print()

# Transpose the summary to get one line per original row.
print(row_summary.T)
0 10 20 30 40 50 60 70 80 90 0 0 10 20 30 40 50 60 70 80 90 1 1 11 21 31 41 51 61 71 81 91 2 2 12 22 32 42 52 62 72 82 92 3 3 13 23 33 43 53 63 73 83 93 4 4 14 24 34 44 54 64 74 84 94 0 10 20 30 40 50 60 70 80 90 count 5.000000 5.000000 5.000000 5.000000 5.000000 5.000000 5.000000 5.000000 5.000000 5.000000 mean 2.000000 12.000000 22.000000 32.000000 42.000000 52.000000 62.000000 72.000000 82.000000 92.000000 std 1.581139 1.581139 1.581139 1.581139 1.581139 1.581139 1.581139 1.581139 1.581139 1.581139 min 0.000000 10.000000 20.000000 30.000000 40.000000 50.000000 60.000000 70.000000 80.000000 90.000000 25% 1.000000 11.000000 21.000000 31.000000 41.000000 51.000000 61.000000 71.000000 81.000000 91.000000 50% 2.000000 12.000000 22.000000 32.000000 42.000000 52.000000 62.000000 72.000000 82.000000 92.000000 75% 3.000000 13.000000 23.000000 33.000000 43.000000 53.000000 63.000000 73.000000 83.000000 93.000000 max 4.000000 14.000000 24.000000 34.000000 44.000000 54.000000 64.000000 74.000000 84.000000 94.000000 count mean std min 25% 50% 75% max 0 5.0 2.0 1.581139 0.0 1.0 2.0 3.0 4.0 10 5.0 12.0 1.581139 10.0 11.0 12.0 13.0 14.0 20 5.0 22.0 1.581139 20.0 21.0 22.0 23.0 24.0 30 5.0 32.0 1.581139 30.0 31.0 32.0 33.0 34.0 40 5.0 42.0 1.581139 40.0 41.0 42.0 43.0 44.0 50 5.0 52.0 1.581139 50.0 51.0 52.0 53.0 54.0 60 5.0 62.0 1.581139 60.0 61.0 62.0 63.0 64.0 70 5.0 72.0 1.581139 70.0 71.0 72.0 73.0 74.0 80 5.0 82.0 1.581139 80.0 81.0 82.0 83.0 84.0 90 5.0 92.0 1.581139 90.0 91.0 92.0 93.0 94.0
pd.DataFrame
to a Python
list
?
I started the week with Monday, instead of Sunday,
because
datetime.weekday
starts the week with Monday.
"Convert a pd.DataFrame to a Python list." import sys import pandas as pd data = [ "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday" ] index = pd.Index(data = data, name = "weekday") data = [ "Spanish", "German", "Hebrew" ] columns = pd.Index(data = data, name = "language") data = [ ["lunes", "Montag", "יוֹם שֵׁנִי"], ["martes", "Dienstag", "יוֹם שְׁלִישִׁי"], ["miércoles", "Mittwoch", "יוֹם רְבִיעִי"], ["jueves", "Donnerstag", "יוֹם חֲמִישִׁי"], ["viernes", "Freitag", "יוֹם שִׁשִּׁי"], ["sábado", "Samstag", "יוֹם שַׁבָּת"], ["domingo", "Sonntag", "יוֹם רִאשׁוֹן"] ] df = pd.DataFrame(data = data, index = index, columns = columns) print(df) print() for colname in list(df): #colname is a string print(colname) sys.exit(0)
language Spanish German Hebrew weekday Monday lunes Montag יוֹם שֵׁנִי Tuesday martes Dienstag יוֹם שְׁלִישִׁי Wednesday miércoles Mittwoch יוֹם רְבִיעִי Thursday jueves Donnerstag יוֹם חֲמִישִׁי Friday viernes Freitag יוֹם שִׁשִּׁי Saturday sábado Samstag יוֹם שַׁבָּת Sunday domingo Sonntag יוֹם רִאשׁוֹן Spanish German Hebrew
What do you get when you convert the
pd.DataFrame
to a Python
dict
?
# Converting the pd.DataFrame to a dict maps each column label to that column.
# The keys are therefore the languages, not the weekdays; the weekdays are the
# index of each pd.Series value.
for language, series in dict(df).items():   # language is a string, series is a pd.Series
    print(language)
    print(series)
    print()
Spanish weekday Monday lunes Tuesday martes Wednesday miércoles Thursday jueves Friday viernes Saturday sábado Sunday domingo Name: Spanish, dtype: object German weekday Monday Montag Tuesday Dienstag Wednesday Mittwoch Thursday Donnerstag Friday Freitag Saturday Samstag Sunday Sonntag Name: German, dtype: object Hebrew weekday Monday יוֹם שֵׁנִי Tuesday יוֹם שְׁלִישִׁי Wednesday יוֹם רְבִיעִי Thursday יוֹם חֲמִישִׁי Friday יוֹם שִׁשִּׁי Saturday יוֹם שַׁבָּת Sunday יוֹם רִאשׁוֹן Name: Hebrew, dtype: object
pd.DataFrame
.
"Plot a pd.DataFrame with matplotlib.pyplot." import pandas as pd import matplotlib.pyplot as plt data = [ "New York", "Yonkers", "Pougheepsie", "Albany" ] columns = pd.Index(data = data, name = "City") index = pd.RangeIndex(1, 7, name = "day of month") #Gets colder as you go north. #Gets warmer as the month progresses. data = [ [ 6, 4, 2, 0], [16, 14, 12, 10], [26, 24, 22, 20], [36, 34, 32, 30], [46, 44, 42, 40], [56, 54, 52, 50] ] df = pd.DataFrame(data = data, index = index, columns = columns) print(df) axes = df.plot(grid = True, figsize = [6.4, 4.8]) #width and height in inches figure = plt.gcf() figure.canvas.set_window_title("matplotlib.pyplot DataFrame.plot") axes.set_title("Daily Temperature in March") axes.set_ylabel("temperature in Fahrenheit") plt.show() #infinite loop
City New York Yonkers Pougheepsie Albany day of month 1 6 4 2 0 2 16 14 12 10 3 26 24 22 20 4 36 34 32 30 5 46 44 42 40 6 56 54 52 50
pd.DataFrame
.