Create a pd.DataFrame object

A pd.DataFrame object contains one or more columns of information. Each column is a pd.Series, and all the pd.Serieses share the same index. If you don’t specify an index, the index of a pd.DataFrame, like the index of a pd.Series, defaults to a RangeIndex.

In addition to the familiar vertical index that provides an identifying number or name for each row, a pd.DataFrame has a horizontal index that provides an identifying number or name for each column.

"Create a pd.DataFrame object.  Examine its two default indices, vertical and horizontal."

import sys
import pandas as pd

#A 3 by 4 grid of floats: each inner list becomes one row of the pd.DataFrame.
data = [
    [ 0.0,  1.0,  2.0,  3.0],
    [10.0, 11.0, 12.0, 13.0],
    [20.0, 21.0, 22.0, 23.0],
]

#No index or columns are given here, so both axes get their defaults.
df = pd.DataFrame(data)
print(df, end = "\n\n")

#The "=" specifier echoes the text of the expression, so the padding
#inside the braces is part of what gets printed.
print(f"{df.index   = }", f"{df.columns = }", sep = "\n", end = "\n\n")

#df.dtypes is a pd.Series holding the dtype of every column.
print(df.dtypes, end = "\n\n")

#Replace both default indices with named indices of strings.
row_labels = ["row0", "row1", "row2"]
column_labels = ["col0", "col1", "col2", "col3"]
df.index = pd.Index(row_labels, name = "rows")
df.columns = pd.Index(column_labels, name = "cols")

print(df)
sys.exit(0)
      0     1     2     3
0   0.0   1.0   2.0   3.0
1  10.0  11.0  12.0  13.0
2  20.0  21.0  22.0  23.0

df.index   = RangeIndex(start=0, stop=3, step=1)
df.columns = RangeIndex(start=0, stop=4, step=1)

0    float64
1    float64
2    float64
3    float64
dtype: object

cols  col0  col1  col2  col3
rows
row0   0.0   1.0   2.0   3.0
row1  10.0  11.0  12.0  13.0
row2  20.0  21.0  22.0  23.0

Select a column, row, or individual value.

"Select a column, a row, or an individual value from a pd.DataFrame."

import sys
import pandas as pd

data = [                       #three rows and four columns
    [ 0.0,  1.0,  2.0,  3.0],
    [10.0, 11.0, 12.0, 13.0],
    [20.0, 21.0, 22.0, 23.0]
]

df = pd.DataFrame(data = data)

#Name the rows and columns so they can be selected by label.
df.index   = pd.Index(data = ["row0", "row1", "row2"],         name = "rows")
df.columns = pd.Index(data = ["col0", "col1", "col2", "col3"], name = "cols")

print(df)
print()

series = df["col2"]     #Get one of the columns, as a pd.Series indexed by row label.
print(series)
print()

series = df.loc["row2"] #Get one of the rows, by label.  Could also say df.iloc[2] (by position).
print(series)           #A pd.Series always prints vertically, even if originally a row.
print()

#at selects a single value by row and column label, iat by integer position.
print(f'{df.at["row2", "col2"] = }')   #Get an individual value.
print(f'{df.iat[2, 2]          = }')

sys.exit(0)
cols  col0  col1  col2  col3
rows
row0   0.0   1.0   2.0   3.0
row1  10.0  11.0  12.0  13.0
row2  20.0  21.0  22.0  23.0

rows
row0     2.0
row1    12.0
row2    22.0
Name: col2, dtype: float64

cols
col0    20.0
col1    21.0
col2    22.0
col3    23.0
Name: row2, dtype: float64

df.at["row2", "col2"] = 22.0
df.iat[2, 2]          = 22.0

Things to try

  1. Change the above data to one of the following. The first one does the work in Python. The second one is faster because it ultimately does the work in C.
    nrows = 3   #how many rows
    ncols = 4   #how many columns

    #Nested Python list comprehension.  The outer loop steps through the
    #tens (one per row), the inner loop through the ones (one per column).
    #data is a list of nrows lists, each holding ncols floats.

    data = [
        [float(tens + ones) for ones in range(ncols)]
        for tens in range(0, 10 * nrows, 10)
    ]
    
    #remember to import numpy as np
    
    nrows = 3   #how many rows
    ncols = 4   #how many columns

    #np.meshgrid expands the two 1-dimensional ranges into two nrows by ncols
    #np.ndarrays.  Adding them elementwise yields a third, also nrows by ncols.
    ones, tens = np.meshgrid(
        np.arange(ncols, dtype = np.float64),
        np.arange(0, 10 * nrows, 10, dtype = np.float64)
    )
    data = ones + tens

    #Print all three, just to help you understand.
    for array in (ones, tens, data):
        print(array)
        print()
    
    [[0. 1. 2. 3.]
     [0. 1. 2. 3.]
     [0. 1. 2. 3.]]
    
    [[ 0.  0.  0.  0.]
     [10. 10. 10. 10.]
     [20. 20. 20. 20.]]
    
    [[ 0.  1.  2.  3.]
     [10. 11. 12. 13.]
     [20. 21. 22. 23.]]
    
    You can also create data this way:
    nrows = 3   #number of rows
    ncols = 4   #number of columns
    
    ones = np.arange(ncols, dtype = np.float64)               #one value per column
    tens = np.arange(0, 10 * nrows, 10, dtype = np.float64)   #one value per row
    t = np.meshgrid(ones, tens)   #t holds two np.ndarrays (a tuple in NumPy 2.0 and later, a list in earlier versions)
    data = sum(t)                 #the built-in sum starts at 0 and adds each np.ndarray in turn, elementwise
    
    or even this way:
    nrows = 3   #number of rows
    ncols = 4   #number of columns
    
    ones = np.arange(ncols, dtype = np.float64)               #one value per column
    tens = np.arange(0, 10 * nrows, 10, dtype = np.float64)   #one value per row
    data = sum(np.meshgrid(ones, tens))   #same as above, without naming the intermediate pair of np.ndarrays
    
  2. Print a pd.DataFrame that is wider than pd.options.display.max_columns. See Options and settings. The option_context function creates and returns a Python context manager. See Context Manager Types.
    "Print a pd.DataFrame that is wider than pd.options.display.max_columns."
    
    import sys
    import numpy as np
    import pandas as pd
    
    #Show the two display options that govern how a wide pd.DataFrame prints.
    #The "=" specifier echoes each expression along with its value.
    for line in (
        f'{pd.get_option("display.max_columns") = }',
        f'{pd.get_option("display.width") = }',
        ""
    ):
        print(line)

    nrows = 10
    ncols = 16

    #np.meshgrid returns two nrows by ncols np.ndarrays.  Their elementwise
    #sum puts 100 * row + column in each cell.
    ones, hundreds = np.meshgrid(np.arange(ncols), np.arange(0, 100 * nrows, 100))
    row_labels = pd.RangeIndex(0, 100 * nrows, 100)
    df = pd.DataFrame(ones + hundreds, index = row_labels)

    rule = "-" * 80

    #First print with whatever display options are currently in effect.
    print(df)
    print(rule, end = "\n\n")

    #Then with no limit on the number of columns; the options revert
    #when the with statement ends.
    with pd.option_context("display.max_columns", None):
        print(df)
    print(rule, end = "\n\n")

    #Finally allow all 16 columns and lift the fixed display width as well.
    with pd.option_context("display.width", None, "display.max_columns", 16):
        print(df)

    sys.exit(0)
    
    pd.get_option("display.max_columns") = 0
    pd.get_option("display.width") = 80
    
          0    1    2    3    4    5    6   ...   9    10   11   12   13   14   15
    0      0    1    2    3    4    5    6  ...    9   10   11   12   13   14   15
    100  100  101  102  103  104  105  106  ...  109  110  111  112  113  114  115
    200  200  201  202  203  204  205  206  ...  209  210  211  212  213  214  215
    300  300  301  302  303  304  305  306  ...  309  310  311  312  313  314  315
    400  400  401  402  403  404  405  406  ...  409  410  411  412  413  414  415
    500  500  501  502  503  504  505  506  ...  509  510  511  512  513  514  515
    600  600  601  602  603  604  605  606  ...  609  610  611  612  613  614  615
    700  700  701  702  703  704  705  706  ...  709  710  711  712  713  714  715
    800  800  801  802  803  804  805  806  ...  809  810  811  812  813  814  815
    900  900  901  902  903  904  905  906  ...  909  910  911  912  913  914  915
    
    [10 rows x 16 columns]
    --------------------------------------------------------------------------------
    
           0    1    2    3    4    5    6    7    8    9   10   11   12   13  \
    0      0    1    2    3    4    5    6    7    8    9   10   11   12   13
    100  100  101  102  103  104  105  106  107  108  109  110  111  112  113
    200  200  201  202  203  204  205  206  207  208  209  210  211  212  213
    300  300  301  302  303  304  305  306  307  308  309  310  311  312  313
    400  400  401  402  403  404  405  406  407  408  409  410  411  412  413
    500  500  501  502  503  504  505  506  507  508  509  510  511  512  513
    600  600  601  602  603  604  605  606  607  608  609  610  611  612  613
    700  700  701  702  703  704  705  706  707  708  709  710  711  712  713
    800  800  801  802  803  804  805  806  807  808  809  810  811  812  813
    900  900  901  902  903  904  905  906  907  908  909  910  911  912  913
    
          14   15
    0     14   15
    100  114  115
    200  214  215
    300  314  315
    400  414  415
    500  514  515
    600  614  615
    700  714  715
    800  814  815
    900  914  915
    --------------------------------------------------------------------------------
    
           0    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15
    0      0    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15
    100  100  101  102  103  104  105  106  107  108  109  110  111  112  113  114  115
    200  200  201  202  203  204  205  206  207  208  209  210  211  212  213  214  215
    300  300  301  302  303  304  305  306  307  308  309  310  311  312  313  314  315
    400  400  401  402  403  404  405  406  407  408  409  410  411  412  413  414  415
    500  500  501  502  503  504  505  506  507  508  509  510  511  512  513  514  515
    600  600  601  602  603  604  605  606  607  608  609  610  611  612  613  614  615
    700  700  701  702  703  704  705  706  707  708  709  710  711  712  713  714  715
    800  800  801  802  803  804  805  806  807  808  809  810  811  812  813  814  815
    900  900  901  902  903  904  905  906  907  908  909  910  911  912  913  914  915
    
  3. Create a new pd.DataFrame that contains only the first five rows of the original pd.DataFrame.
    "Create a new pd.DataFrame containing only the first 5 rows of the original pd.DataFrame."
    
    import sys
    import numpy as np
    import pandas as pd
    
    nrows = 7   #how many rows
    ncols = 6   #how many columns

    #Each cell holds 10 * row + column.
    ones, tens = np.meshgrid(np.arange(ncols), np.arange(0, 10 * nrows, 10))
    df = pd.DataFrame(ones + tens)
    print(df, end = "\n\n")

    first_rows = df.head(n = 5)   #n defaults to 5; spelled out here for clarity
    print(first_rows)
    sys.exit(0)
    
        0   1   2   3   4   5
    0   0   1   2   3   4   5
    1  10  11  12  13  14  15
    2  20  21  22  23  24  25
    3  30  31  32  33  34  35
    4  40  41  42  43  44  45
    5  50  51  52  53  54  55
    6  60  61  62  63  64  65
    
        0   1   2   3   4   5
    0   0   1   2   3   4   5
    1  10  11  12  13  14  15
    2  20  21  22  23  24  25
    3  30  31  32  33  34  35
    4  40  41  42  43  44  45
    

    Now create a new pd.DataFrame that contains only the first five columns of the original pd.DataFrame.

    "Create a new pd.DataFrame containing only the first 5 columns of the original pd.DataFrame."
    
    import sys
    import numpy as np
    import pandas as pd
    
    nrows = 7   #how many rows
    ncols = 6   #how many columns

    #Each cell holds 10 * row + column.
    ones, tens = np.meshgrid(np.arange(ncols), np.arange(0, 10 * nrows, 10))
    df = pd.DataFrame(ones + tens)

    #head selects rows, not columns.  So transpose (uppercase T) to turn the
    #columns into rows, take the head, and transpose back again.
    transposed = df.T
    transposed_head = transposed.head()
    first_columns = transposed_head.T

    #Print every intermediate step, each followed by an empty line.
    for frame in (df, transposed, transposed_head):
        print(frame)
        print()

    print(first_columns)
    sys.exit(0)
    
        0   1   2   3   4   5
    0   0   1   2   3   4   5
    1  10  11  12  13  14  15
    2  20  21  22  23  24  25
    3  30  31  32  33  34  35
    4  40  41  42  43  44  45
    5  50  51  52  53  54  55
    6  60  61  62  63  64  65
    
       0   1   2   3   4   5   6
    0  0  10  20  30  40  50  60
    1  1  11  21  31  41  51  61
    2  2  12  22  32  42  52  62
    3  3  13  23  33  43  53  63
    4  4  14  24  34  44  54  64
    5  5  15  25  35  45  55  65
    
       0   1   2   3   4   5   6
    0  0  10  20  30  40  50  60
    1  1  11  21  31  41  51  61
    2  2  12  22  32  42  52  62
    3  3  13  23  33  43  53  63
    4  4  14  24  34  44  54  64
    
        0   1   2   3   4
    0   0   1   2   3   4
    1  10  11  12  13  14
    2  20  21  22  23  24
    3  30  31  32  33  34
    4  40  41  42  43  44
    5  50  51  52  53  54
    6  60  61  62  63  64
    
    Make sure that tail works too.
  4. Describe each of the five columns of the following pd.DataFrame.
  5. "Describe each column of a pd.DataFrame."
    
    import sys
    import numpy as np
    import pandas as pd
    
    nrows = 10
    ncols = 5

    #Each cell holds 10 * row + column.  The rows are labeled 0, 10, 20, ...
    ones, tens = np.meshgrid(np.arange(ncols), np.arange(0, 10 * nrows, 10))
    df = pd.DataFrame(ones + tens, index = pd.RangeIndex(0, 10 * nrows, 10))
    print(df, end = "\n\n")

    summary = df.describe()   #one column of statistics for each column of df
    print(summary)
    sys.exit(0)
    
         0   1   2   3   4
    0    0   1   2   3   4
    10  10  11  12  13  14
    20  20  21  22  23  24
    30  30  31  32  33  34
    40  40  41  42  43  44
    50  50  51  52  53  54
    60  60  61  62  63  64
    70  70  71  72  73  74
    80  80  81  82  83  84
    90  90  91  92  93  94
    
                   0          1          2          3          4
    count  10.000000  10.000000  10.000000  10.000000  10.000000
    mean   45.000000  46.000000  47.000000  48.000000  49.000000
    std    30.276504  30.276504  30.276504  30.276504  30.276504
    min     0.000000   1.000000   2.000000   3.000000   4.000000
    25%    22.500000  23.500000  24.500000  25.500000  26.500000
    50%    45.000000  46.000000  47.000000  48.000000  49.000000
    75%    67.500000  68.500000  69.500000  70.500000  71.500000
    max    90.000000  91.000000  92.000000  93.000000  94.000000
    

    Now describe each of the ten rows.

    by_row = df.T   #uppercase T for transpose: each row of df is now a column
    print(by_row, end = "\n\n")

    #describe works column by column, so describing the transpose
    #describes the rows of the original df.
    row_stats = by_row.describe()
    with pd.option_context("display.width", None):   #ten columns of statistics are too wide for 80 characters
        print(row_stats)
    print()

    print(row_stats.T)   #transpose back: one line of statistics for each row of df
    
       0  10  20  30  40  50  60  70  80  90
    0  0  10  20  30  40  50  60  70  80  90
    1  1  11  21  31  41  51  61  71  81  91
    2  2  12  22  32  42  52  62  72  82  92
    3  3  13  23  33  43  53  63  73  83  93
    4  4  14  24  34  44  54  64  74  84  94
    
                 0          10         20         30         40         50         60         70         80         90
    count  5.000000   5.000000   5.000000   5.000000   5.000000   5.000000   5.000000   5.000000   5.000000   5.000000
    mean   2.000000  12.000000  22.000000  32.000000  42.000000  52.000000  62.000000  72.000000  82.000000  92.000000
    std    1.581139   1.581139   1.581139   1.581139   1.581139   1.581139   1.581139   1.581139   1.581139   1.581139
    min    0.000000  10.000000  20.000000  30.000000  40.000000  50.000000  60.000000  70.000000  80.000000  90.000000
    25%    1.000000  11.000000  21.000000  31.000000  41.000000  51.000000  61.000000  71.000000  81.000000  91.000000
    50%    2.000000  12.000000  22.000000  32.000000  42.000000  52.000000  62.000000  72.000000  82.000000  92.000000
    75%    3.000000  13.000000  23.000000  33.000000  43.000000  53.000000  63.000000  73.000000  83.000000  93.000000
    max    4.000000  14.000000  24.000000  34.000000  44.000000  54.000000  64.000000  74.000000  84.000000  94.000000
    
        count  mean       std   min   25%   50%   75%   max
    0     5.0   2.0  1.581139   0.0   1.0   2.0   3.0   4.0
    10    5.0  12.0  1.581139  10.0  11.0  12.0  13.0  14.0
    20    5.0  22.0  1.581139  20.0  21.0  22.0  23.0  24.0
    30    5.0  32.0  1.581139  30.0  31.0  32.0  33.0  34.0
    40    5.0  42.0  1.581139  40.0  41.0  42.0  43.0  44.0
    50    5.0  52.0  1.581139  50.0  51.0  52.0  53.0  54.0
    60    5.0  62.0  1.581139  60.0  61.0  62.0  63.0  64.0
    70    5.0  72.0  1.581139  70.0  71.0  72.0  73.0  74.0
    80    5.0  82.0  1.581139  80.0  81.0  82.0  83.0  84.0
    90    5.0  92.0  1.581139  90.0  91.0  92.0  93.0  94.0
    
  6. What do you get when you convert a pd.DataFrame to a Python list? I started the week with Monday, instead of Sunday, because datetime.weekday starts the week with Monday.
    "Convert a pd.DataFrame to a Python list."
    import sys
    import pandas as pd
    
    #Row labels: the seven days of the week, starting with Monday.
    index = pd.Index(
        data = [
            "Monday",
            "Tuesday",
            "Wednesday",
            "Thursday",
            "Friday",
            "Saturday",
            "Sunday"
        ],
        name = "weekday"
    )

    #Column labels: one per language.
    columns = pd.Index(data = ["Spanish", "German", "Hebrew"], name = "language")
    
    data = [
        ["lunes",     "Montag",     "יוֹם שֵׁנִי"],
        ["martes",    "Dienstag",   "יוֹם שְׁלִישִׁי"],
        ["miércoles", "Mittwoch",   "יוֹם רְבִיעִי"],
        ["jueves",    "Donnerstag", "יוֹם חֲמִישִׁי"],
        ["viernes",   "Freitag",    "יוֹם שִׁשִּׁי"],
        ["sábado",    "Samstag",    "יוֹם שַׁבָּת"],
        ["domingo",   "Sonntag",    "יוֹם רִאשׁוֹן"]
    ]
    df = pd.DataFrame(data = data, index = index, columns = columns)
    
    print(df, end = "\n\n")

    #Converting a pd.DataFrame to a Python list yields its column names,
    #each one a string.
    colnames = list(df)
    for language in colnames:
        print(language)

    sys.exit(0)
    
    language     Spanish      German           Hebrew
    weekday
    Monday         lunes      Montag      יוֹם שֵׁנִי
    Tuesday       martes    Dienstag  יוֹם שְׁלִישִׁי
    Wednesday  miércoles    Mittwoch    יוֹם רְבִיעִי
    Thursday      jueves  Donnerstag   יוֹם חֲמִישִׁי
    Friday       viernes     Freitag    יוֹם שִׁשִּׁי
    Saturday      sábado     Samstag     יוֹם שַׁבָּת
    Sunday       domingo     Sonntag    יוֹם רִאשׁוֹן
    
    Spanish
    German
    Hebrew
    

    What do you get when you convert the pd.DataFrame to a Python dict?

    #Converting a pd.DataFrame to a Python dict maps each column name (here a
    #language, not a weekday) to that column's pd.Series.
    for language, series in dict(df).items():   #language is a string, series is a pd.Series
        print(language)
        print(series)
        print()
    
    Spanish
    weekday
    Monday           lunes
    Tuesday         martes
    Wednesday    miércoles
    Thursday        jueves
    Friday         viernes
    Saturday        sábado
    Sunday         domingo
    Name: Spanish, dtype: object
    
    German
    weekday
    Monday           Montag
    Tuesday        Dienstag
    Wednesday      Mittwoch
    Thursday     Donnerstag
    Friday          Freitag
    Saturday        Samstag
    Sunday          Sonntag
    Name: German, dtype: object
    
    Hebrew
    weekday
    Monday           יוֹם שֵׁנִי
    Tuesday      יוֹם שְׁלִישִׁי
    Wednesday      יוֹם רְבִיעִי
    Thursday      יוֹם חֲמִישִׁי
    Friday         יוֹם שִׁשִּׁי
    Saturday        יוֹם שַׁבָּת
    Sunday         יוֹם רִאשׁוֹן
    Name: Hebrew, dtype: object
    
  7. Plot the pd.DataFrame.
    "Plot a pd.DataFrame with matplotlib.pyplot."
    
    import pandas as pd
    import matplotlib.pyplot as plt
    
    #Column labels: one per city.
    data = [
        "New York",
        "Yonkers",
        "Pougheepsie",
        "Albany"
    ]

    columns = pd.Index(data = data, name = "City")
    index = pd.RangeIndex(1, 7, name = "day of month")   #days 1 through 6

    #One row per day, one column per city.
    #Gets colder as you go north.
    #Gets warmer as the month progresses.

    data = [
        [ 6,  4,  2,  0],
        [16, 14, 12, 10],
        [26, 24, 22, 20],
        [36, 34, 32, 30],
        [46, 44, 42, 40],
        [56, 54, 52, 50]
    ]

    df = pd.DataFrame(data = data, index = index, columns = columns)
    print(df)

    #df.plot draws one line per column and returns the matplotlib Axes.
    axes = df.plot(grid = True, figsize = [6.4, 4.8]) #width and height in inches
    figure = plt.gcf()

    #The window title belongs to the figure manager.  The older
    #figure.canvas.set_window_title was deprecated in Matplotlib 3.4
    #and later removed.
    figure.canvas.manager.set_window_title("matplotlib.pyplot DataFrame.plot")
    axes.set_title("Daily Temperature in March")
    axes.set_ylabel("temperature in Fahrenheit")

    plt.show()   #with an interactive backend, runs the GUI event loop until the plot window is closed
    
    City          New York  Yonkers  Pougheepsie  Albany
    day of month                                        
    1                    6        4            2       0
    2                   16       14           12      10
    3                   26       24           22      20
    4                   36       34           32      30
    5                   46       44           42      40
    6                   56       54           52      50
    
  8. For Space Cadets only: examine the source code for class pd.DataFrame.