Aggregation with the NOAA Dataset

This example is from TDM 102 Project 8 Spring 2024.

These example(s) depend on the database:

  • /anvil/projects/tdm/data/whin/weather.parquet

Learn more about the dataset here.

1. Data Wrangling with Pandas

Read the file into a DataFrame called myDF.

(1a) Convert the observation_time column to into a datetime type.

#part a
import pandas as pd
import datetime
myDF = pd.read_parquet("/anvil/projects/tdm/data/whin/weather.parquet")
myDF['observation_time'] = pd.to_datetime(myDF['observation_time'])

(1b) Create 3 new columns for the year, month and day, based on the column observation_time.

# part b)
myDF['year'] = myDF['observation_time'].dt.year
myDF['month'] = myDF['observation_time'].dt.month
myDF['day'] = myDF['observation_time'].dt.day

(1c) For a given station_id, calculate the average month-and-year-pair temperatures (from the column temperature) for that station_id. Try this for a few different station_id values.

# part c
myDF['year'].unique()
avg_monthly_temp = myDF.groupby(['year','month'])['temperature'].mean()
print(avg_monthly_temp.head())

(1d) Now write a function called get_avg_temp that takes one station_id as input and returns the average month-and-year-pair temperatures (associated with that specific station_id). Make sure that the results of your function match with your work from question 1c.

def get_avg_temp(file, station_id):
    df = pd.read_parquet(file)
    df['observation_time'] = pd.to_datetime(df['observation_time'])
    df = df[df['station_id'] == station_id]
    #convert to datetime datatype and add year/month/day cols
    df['year'] = df['observation_time'].dt.year
    df['month'] = df['observation_time'].dt.month
    df['day'] = df['observation_time'].dt.day

    # calculate avg temp per month
    avg_monthly_temp = df.groupby(['year','month'])['temperature'].mean()

    return avg_monthly_temp

2. Data Visualization & Analysis

For this function, be sure to import matplotlib.pyplot.

We will use the function from question 1d to make some line plots.

(2a) For a given station_id, create a line plot, with one line for each year. Try this for a few different station_id values.

import matplotlib.pyplot as plt
# Try it for station_id = 1
avg_temp_by_month = get_avg_temp("/anvil/projects/tdm/data/whin/weather.parquet", 1)
avg_temp_by_month = avg_temp_by_month.to_frame().reset_index()

plt.figure(figsize=(10, 6))
for year in avg_temp_by_month['year'].unique():
    df_year = avg_temp_by_month[avg_temp_by_month['year'] == year]
    plt.plot(df_year['month'], df_year['temperature'], label=str(year), marker='o')

plt.title('Average Monthly Temperature for Station 1 Over the Years')
plt.xlabel('Month')
plt.ylabel('Average Temperature')
plt.legend(title='Year', loc='upper left')
plt.xticks(range(1, 13))
plt.grid(True)

plt.show()
import matplotlib.pyplot as plt
# Try it for station_id = 2
avg_temp_by_month = get_avg_temp("/anvil/projects/tdm/data/whin/weather.parquet", 1)
avg_temp_by_month = avg_temp_by_month.to_frame().reset_index()

plt.figure(figsize=(10, 6))
for year in avg_temp_by_month['year'].unique():
    df_year = avg_temp_by_month[avg_temp_by_month['year'] == year]
    plt.plot(df_year['month'], df_year['temperature'], label=str(year), marker='o')

plt.title('Average Monthly Temperature for Station 1 Over the Years')
plt.xlabel('Month')
plt.ylabel('Average Temperature')
plt.legend(title='Year', loc='upper left')
plt.xticks(range(1, 13))
plt.grid(True)

plt.show()

(2b) Now that you are sure your analysis from 2a works well, wrap your work from question 2a into a function that takes a station_id as input, and creates a line plot, with one line for each year (for the average month-and-year-pair temperatures from that station_id).

# part c
def get_avg_temp(file, station_id):
    df = pd.read_parquet(file)
    df['observation_time'] = pd.to_datetime(df['observation_time'])
    df = df[df['station_id'] == station_id]
    #convert to datetime datatype and add year/month/day cols
    df['year'] = df['observation_time'].dt.year
    df['month'] = df['observation_time'].dt.month
    df['day'] = df['observation_time'].dt.day

    # calculate avg temp per month
    avg_temp_by_month = df.groupby(['year','month'])['temperature'].mean()
    avg_temp_by_month = avg_temp_by_month.to_frame().reset_index()

    plt.figure(figsize=(10, 6))
    for year in avg_temp_by_month['year'].unique():
        df_year = avg_temp_by_month[avg_temp_by_month['year'] == year]
        plt.plot(df_year['month'], df_year['temperature'], label=str(year))
        plt.title('Average Monthly Temperature for Station 1 Over the Years')
        plt.xlabel('month')
        plt.ylabel('Average Temperature')
        plt.legend(title='Year', loc='best')
        plt.xticks(range(1, 13))
    plt.show()
avg_temp_by_month = get_avg_temp("/anvil/projects/tdm/data/whin/weather.parquet",1)

3. More Data Visualization & Analysis

(3a) Revisit the function from question 1d, to find the maximum temperature (instead of the average temperature) in each month-and-year-pair, for a given station. As before, you should test this for several examples before you build the function, and then make sure your function matches your examples.

# part a
df = pd.read_parquet("/anvil/projects/tdm/data/whin/weather.parquet")
df['observation_time'] = pd.to_datetime(df['observation_time'])
df = df[df['station_id'] == 1]
df['year'] = df['observation_time'].dt.year
df['month'] = df['observation_time'].dt.month
df['day'] = df['observation_time'].dt.day
df['year'].unique()
max_monthly_temp = df.groupby(['year','month'])['temperature'].max()
print(max_monthly_temp.head())

(3b) Revisit the function from question 2b, to make a function that takes one station_id as input and it creates a bar plot (instead of a line plot), depicting the maximum temperature in each month-and-year-pair (instead of the average temperature).

# part b
def get_max_temp(file, station_id):
    df = pd.read_parquet(file)
    df['observation_time'] = pd.to_datetime(df['observation_time'])
    df = df[df['station_id'] == station_id]
    #convert to datetime datatype and add year/month/day cols
    df['year'] = df['observation_time'].dt.year
    df['month'] = df['observation_time'].dt.month
    df['day'] = df['observation_time'].dt.day

    # calculate max temp per month
    avg_temp_by_month = df.groupby(['year','month'])['temperature'].max()
    avg_temp_by_month = avg_temp_by_month.to_frame().reset_index()

    plt.figure(figsize=(10, 6))
    for year in avg_temp_by_month['year'].unique():
        df_year = avg_temp_by_month[avg_temp_by_month['year'] == year]
        plt.plot(df_year['month'], df_year['temperature'], label=str(year))
        plt.title('Average Monthly Temperature for Station 1 Over the Years')
        plt.xlabel('month')
        plt.ylabel('Average Temperature')
        plt.legend(title='Year', loc='best')
        plt.xticks(range(1, 13))
    plt.show()

    return max_monthly_temp
max_temp_by_month = get_max_temp("/anvil/projects/tdm/data/whin/weather.parquet",1)

4. Data Visualization – Wind Speed Analysis

(4a) For a given station_id, create a box plot that shows the month-by-month wind speeds in 2020 for that specified station_id. Try this for a few different station_id values.

import pandas as pd
import matplotlib.pyplot as plt

file = "/anvil/projects/tdm/data/whin/weather.parquet"
# Part (a)
def plot_wind_2020(dataset, station_id):
    df = pd.read_parquet(dataset)
    station_df = df[df['station_id'] == station_id].copy()

    station_df['observation_time'] = pd.to_datetime(station_df['observation_time'])
    station_df['year'] = station_df['observation_time'].dt.year
    station_df['month'] = station_df['observation_time'].dt.month

    station_new_df = station_df[station_df['year'] == 2020]

    station_new_df.boxplot(column='wind_speed_mph', by='month', figsize=(10, 6))
    plt.title('Monthly Wind Speed Distribution for the Station in 2020')
    plt.suptitle('')
    plt.xlabel('Month')
    plt.ylabel('Wind Speed (mph)')
    plt.show()
    return station_new_df

(4b) Write a function that takes a year (not necessarily 2020) and a station_id as inputs, and the function creates a box plot about the month-by-month wind speeds in that specific year (not necessarily 2020), at the specified station_id.

# Part (b)
def plot_wind_dist(dataset, station_id, year):
    df = pd.read_parquet(dataset)
    station_df = df[df['station_id'] == station_id].copy()

    station_df['observation_time'] = pd.to_datetime(station_df['observation_time'])
    station_df['year'] = station_df['observation_time'].dt.year
    station_df['month'] = station_df['observation_time'].dt.month

    station_new_df = station_df[station_df['year'] == year]

    station_new_df.boxplot(column='wind_speed_mph', by='month', figsize=(10, 6))
    plt.title('Monthly Wind Speed Distribution for the Station in {}'.format(year))
    plt.suptitle('')
    plt.xlabel('Month')
    plt.ylabel('Wind Speed (mph)')
    plt.show()
    return station_new_df

wind_df = plot_wind_2020(file, 1)
wind_df = plot_wind_dist(file, 1, 2019)