Year of Birth
DataFrame
object
"""
Sort a DataFrame in order of decreasing length of name.
Break ties by sorting in alphabetical order.
"""
import sys
import numpy as np
import pandas as pd
# Address of the input file handed to pd.read_csv.
url = "http://oit2.scps.nyu.edu/~meretzkm/pandas/dataframe/testyob2018.txt"

# Column labels handed to pd.read_csv through its names argument.
names = [
    "name",
    "sex",
    "births",
]
def converter(births):
    """
    Convert one raw births value to a np.float64.

    births: the value to convert (e.g. the text of one field of the input file).
    Return the value as a np.float64, or np.nan if it can't be converted.
    """
    try:
        return np.float64(births)
    except (TypeError, ValueError, OverflowError):
        # Catch only conversion failures.  A bare except would also swallow
        # KeyboardInterrupt and SystemExit.
        return np.nan
# Per-column conversion functions, keyed by column label; intended for the
# converters argument of pd.read_csv (see the commented-out argument below).
converters = {"births": converter}

# NOTE(review): the converters argument is commented out in this call, so the
# dict above is currently unused -- presumably on purpose; confirm.
df = pd.read_csv(url, names=names)  # , converters=converters)

# Show the column dtypes and then the DataFrame itself,
# each followed by a blank line.
for preview in (df.dtypes, df):
    print(preview)
    print()

# End the script here with exit status 0.
sys.exit(0)
def _report_bad_rows(frame, bad, what):
    """
    Print the rows of frame that are flagged by the Series of bools bad.
    Print nothing at all if no row is flagged.

    frame: the DataFrame being checked.
    bad: a Series of bools, True for each row that failed the check.
    what: the end of the phrase "row(s) with a bad ...", e.g. "name".
    """
    if bad.any():
        print(f"The DataFrame contains {bad.sum()} row(s) with a bad {what}.")
        print(frame[bad])
        print()


# Compute each validity mask once, and use it for both the report and the filter.

# na=False makes a missing name count as "no match" instead of putting a NaN
# into the mask.  A mask containing NaN can be neither negated with ~ nor used
# to index the DataFrame.
name_ok = df["name"].str.match(r"^[A-Z][a-z]{1,14}$", na=False)
sex_ok = df["sex"].isin(["M", "F"])

# errors="coerce" turns every non-numeric value into NaN, so the check works
# whether or not the column was already converted to numbers when it was read.
births = pd.to_numeric(df["births"], errors="coerce")
births_ok = births.notna()

_report_bad_rows(df, ~name_ok, "name")
_report_bad_rows(df, ~sex_ok, "sex")
_report_bad_rows(df, ~births_ok, "number of births")

# Keep only the rows that passed all three checks.  The .copy() yields an
# independent DataFrame, so the column assignments below don't trigger
# SettingWithCopyWarning.
df = df.assign(births=births)[name_ok & sex_ok & births_ok].copy()
print(df)

# Show how the size of the DataFrame changes as each column gets a smaller dtype.
print(f"{sys.getsizeof(df) = :,}")
df["sex"] = df["sex"] == "M"    # True for "M", False for "F"
print(f"{sys.getsizeof(df) = :,}")
# NOTE(review): np.int16 holds at most 32,767 -- confirm no count exceeds that.
df["births"] = df["births"].astype(np.int16)
print(f"{sys.getsizeof(df) = :,}")
print()

name_length = df["name"].str.len()
print(f'Longest name is {name_length.max()} characters.')

# Sort in order of decreasing length of name, breaking ties alphabetically.
# The length column is needed only as a sort key, so drop it afterwards.
df = (
    df.assign(length=name_length)
    .sort_values(by=["length", "name"], ascending=[False, True])
    .drop(columns="length")
)

# The sex column now holds bools; print them as the original letters.
formatters = {"sex": lambda b: "M" if b else "F"}
s = df.to_string(formatters=formatters, max_rows=10, show_dimensions=True)
print(s)
print()
sys.exit(0)