atodorov284 committed · d2d624a · Parent(s): 6015ff3
added extra materials and update README
Files changed:
- README.md (+9 -1)
- air-quality-forecast/data_pipeline.py (+130 -52)
- air-quality-forecast/main.py (+1 -2)
- air-quality-forecast/utils.py (+54 -21)
- data/interim/correlation_matrix.csv (+30 -0)
- extra_scripts/corr_map.R (+34 -0)
- extra_scripts/histograms.tex (+167 -0)
README.md
CHANGED
@@ -4,7 +4,15 @@
   <img src="https://img.shields.io/badge/CCDS-Project%20template-328F97?logo=cookiecutter" />
 </a>
 
-
+Air pollution is a significant environmental concern, especially in urban areas, where high levels of nitrogen dioxide and ozone can harm human health, the ecosystem, and overall quality of life. Given these risks, monitoring and forecasting air pollution levels is an important task, allowing timely action to reduce harmful effects.
+
+In the Netherlands, cities like Utrecht face air-quality challenges driven by urbanization, transportation, and industrial activity. A system providing accurate, robust real-time air-quality monitoring and reliable forecasts of future pollution levels would let authorities and residents take preventive measures and plan their activities around the expected air quality. This project focuses on time-series forecasting of air pollution levels, specifically NO$_2$ and O$_3$ concentrations, for the next three days. The task is framed as a regression problem, where the goal is to predict continuous values from historical environmental data. The project also provides infrastructure for real-time prediction based on recent measurements.
+
+## How To Run This Code
+
+Currently, this repository is at the data engineering stage. To run the data pipeline, run main.py under air-quality-forecast, which contains the source code of this project. The processed and split datasets can be found under data/processed, namely x_train, x_val, x_test, y_train, y_val, and y_test.
+
+The notebooks in this project were used as scratch space for analysis and for merging the data; they do not reflect our final methodology (the source is under air-quality-forecast). Extra scripts used to generate the plots in the report can be found under extra_scripts.
 
 ## Project Organization
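For concreteness, running the pipeline is equivalent to the following, which is exactly what air-quality-forecast/main.py does in this commit (assuming pandas, numpy, and scikit-learn are installed, and the working directory is air-quality-forecast/ so that data_pipeline is importable):

    from data_pipeline import PreprocessingPipeline

    if __name__ == "__main__":
        pipeline = PreprocessingPipeline()
        pipeline.run_pipeline()  # writes x_train ... y_val CSVs to data/processed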
air-quality-forecast/data_pipeline.py
CHANGED
@@ -1,11 +1,12 @@
 import pandas as pd
 import os
 from utils import FeatureSelector, InputValidator
 from sklearn.preprocessing import MinMaxScaler
 from sklearn.model_selection import train_test_split
 from typing import Tuple
 import numpy as np
 
+
 class DataLoader:
     def __init__(self, raw_data_path: str, processed_data_path: str) -> None:
         """
@@ -30,8 +31,14 @@ class DataLoader:
         """
         InputValidator.validate_file_exists(self.raw_data_path, "raw_data_path")
 
-        self.raw_griftpark_data = pd.read_csv(os.path.join(self.raw_data_path, 'v1_raw_griftpark,-utrecht-air-quality.csv'))
-        self.raw_utrecht_data = pd.read_csv(os.path.join(self.raw_data_path, 'v1_utrecht 2014-01-29 to 2024-09-11.csv'))
+        self.raw_griftpark_data = pd.read_csv(
+            os.path.join(
+                self.raw_data_path, "v1_raw_griftpark,-utrecht-air-quality.csv"
+            )
+        )
+        self.raw_utrecht_data = pd.read_csv(
+            os.path.join(self.raw_data_path, "v1_utrecht 2014-01-29 to 2024-09-11.csv")
+        )
         return self.raw_griftpark_data, self.raw_utrecht_data
 
     def save_to_csv(self, name: str, data: pd.DataFrame) -> None:
@@ -42,16 +49,18 @@ class DataLoader:
         :param data: The Pandas DataFrame to save as a CSV.
         """
         InputValidator.validate_type(name, str, "name")
 
         # If the data is a numpy array, convert it to a Pandas DataFrame
         if isinstance(data, np.ndarray):
             data = pd.DataFrame(data)
 
         data.to_csv(os.path.join(self.processed_data_path, name))
 
 
 class FeatureProcessor:
-    def __init__(self, griftpark_data: pd.DataFrame, utrecht_data: pd.DataFrame) -> None:
+    def __init__(
+        self, griftpark_data: pd.DataFrame, utrecht_data: pd.DataFrame
+    ) -> None:
         """
         Initializes the FeatureProcessor with Griftpark and Utrecht data.
 
@@ -60,7 +69,7 @@ class FeatureProcessor:
         """
         InputValidator.validate_type(griftpark_data, pd.DataFrame, "griftpark_data")
         InputValidator.validate_type(utrecht_data, pd.DataFrame, "utrecht_data")
 
         self.griftpark_data = griftpark_data
         self.utrecht_data = utrecht_data
         self.merged_data = None
@@ -71,8 +80,12 @@ class FeatureProcessor:
 
         :return: The merged Pandas DataFrame.
         """
-        self.utrecht_data['datetime'] = pd.to_datetime(self.utrecht_data['datetime'], format='%Y-%m-%d').dt.strftime('%d/%m/%Y')
-        self.merged_data = pd.merge(self.griftpark_data, self.utrecht_data, left_on='date', right_on='datetime')
+        self.utrecht_data["datetime"] = pd.to_datetime(
+            self.utrecht_data["datetime"], format="%Y-%m-%d"
+        ).dt.strftime("%d/%m/%Y")
+        self.merged_data = pd.merge(
+            self.griftpark_data, self.utrecht_data, left_on="date", right_on="datetime"
+        )
         return self.merged_data
 
     def sort_data_by_date(self) -> pd.DataFrame:
@@ -83,8 +96,10 @@ class FeatureProcessor:
         """
         if self.merged_data is None:
             raise ValueError("Merged data not available. Please merge data first.")
-        self.merged_data['datetime'] = pd.to_datetime(self.merged_data['datetime'], format='%d/%m/%Y')
-        self.merged_data.sort_values(by='datetime', ascending=False, inplace=True)
+        self.merged_data["datetime"] = pd.to_datetime(
+            self.merged_data["datetime"], format="%d/%m/%Y"
+        )
+        self.merged_data.sort_values(by="datetime", ascending=False, inplace=True)
         return self.merged_data
 
     def select_features(self) -> pd.DataFrame:
@@ -95,20 +110,20 @@ class FeatureProcessor:
         """
         if self.merged_data is None:
             raise ValueError("Merged data not available. Please merge data first.")
 
         InputValidator.validate_type(self.merged_data, pd.DataFrame, "merged_data")
 
         # Feature selection logic
         cols_to_drop = FeatureSelector.uninformative_columns()
-        self.merged_data.drop(cols_to_drop, axis=1, inplace=True, errors='ignore')
+        self.merged_data.drop(cols_to_drop, axis=1, inplace=True, errors="ignore")
         self.merged_data = FeatureSelector.rename_initial_columns(self.merged_data)
         self.merged_data = FeatureSelector.change_to_numeric(self.merged_data)
 
         selected_columns = FeatureSelector.select_cols_by_correlation(self.merged_data)
-        domain_knowledge_columns = ['precip', 'windspeed', 'winddir']
-        selected_columns = ['date'] + selected_columns + domain_knowledge_columns
+        domain_knowledge_columns = ["precip", "windspeed", "winddir"]
+        selected_columns = ["date"] + selected_columns + domain_knowledge_columns
         self.merged_data = self.merged_data[selected_columns]
 
         return self.merged_data
 
     def apply_time_shift(self, t_max: int = 3) -> pd.DataFrame:
@@ -120,20 +135,26 @@ class FeatureProcessor:
         """
         if self.merged_data is None:
             raise ValueError("Data not available. Please process data first.")
 
         InputValidator.validate_type(t_max, int, "t_max")
 
         # Time shifting logic
         all_cols = self.merged_data.columns
         for t in range(1, t_max + 1):
             for col in all_cols:
-                self.merged_data[[f'{col} - day {t}']] = self.merged_data[[col]].shift(-t)
-
+                self.merged_data[[f"{col} - day {t}"]] = self.merged_data[[col]].shift(
+                    -t
+                )
+
         for t in range(1, 3):
-            for col in ['o3', 'no2']:
-                self.merged_data[[f'{col} + day {t}']] = self.merged_data[[col]].shift(t)
-
-        self.merged_data[self.merged_data.columns] = self.merged_data[self.merged_data.columns].apply(pd.to_numeric)
+            for col in ["o3", "no2"]:
+                self.merged_data[[f"{col} + day {t}"]] = self.merged_data[[col]].shift(
+                    t
+                )
+
+        self.merged_data[self.merged_data.columns] = self.merged_data[
+            self.merged_data.columns
+        ].apply(pd.to_numeric)
         return self.merged_data
 
     def preprocess_data(self) -> pd.DataFrame:
@@ -143,13 +164,36 @@ class FeatureProcessor:
         :return: The preprocessed Pandas DataFrame.
         """
         self.select_features()
-        self.merged_data.set_index('date', inplace=True)
-        self.merged_data.dropna(subset=['o3', 'no2'])
+        self.merged_data.set_index("date", inplace=True)
+        self.merged_data.dropna(subset=["o3", "no2"])
         self.apply_time_shift()
 
         # Drop unnecessary columns
-        self.merged_data.drop(['pm25', 'pm10', 'temp', 'humidity', 'visibility', 'solarradiation', 'precip', 'windspeed', 'winddir'], axis=1, inplace=True)
-        self.merged_data.drop(index=['29/01/2014', '30/01/2014', '31/01/2014', '10/09/2024', '11/09/2024'], inplace=True)
+        self.merged_data.drop(
+            [
+                "pm25",
+                "pm10",
+                "temp",
+                "humidity",
+                "visibility",
+                "solarradiation",
+                "precip",
+                "windspeed",
+                "winddir",
+            ],
+            axis=1,
+            inplace=True,
+        )
+        self.merged_data.drop(
+            index=[
+                "29/01/2014",
+                "30/01/2014",
+                "31/01/2014",
+                "10/09/2024",
+                "11/09/2024",
+            ],
+            inplace=True,
+        )
 
         self.preprocessed_data = self.merged_data
         return self.preprocessed_data
@@ -172,14 +216,28 @@ class PreprocessingPipeline:
         Initializes the PreprocessingPipeline with paths to raw and processed data directories.
         """
         project_root = os.path.dirname(os.path.dirname(__file__))
-        raw_data_path = os.path.join(project_root, 'data', 'raw')
-        processed_data_path = os.path.join(project_root, 'data', 'processed')
+        raw_data_path = os.path.join(project_root, "data", "raw")
+        processed_data_path = os.path.join(project_root, "data", "processed")
 
         self.data_loader = DataLoader(raw_data_path, processed_data_path)
         self.feature_processor = None
         self.normalizer = MinMaxScaler()
 
-    def train_test_validation_split(self, x: pd.DataFrame, y: pd.DataFrame, test_size_: float = 0.15, validation_size: float = 0.15, random_state_=4242) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
+    def train_test_validation_split(
+        self,
+        x: pd.DataFrame,
+        y: pd.DataFrame,
+        test_size_: float = 0.15,
+        validation_size: float = 0.15,
+        random_state_=4242,
+    ) -> Tuple[
+        pd.DataFrame,
+        pd.DataFrame,
+        pd.DataFrame,
+        pd.DataFrame,
+        pd.DataFrame,
+        pd.DataFrame,
+    ]:
         """
         Split the data into training and testing sets.
 
@@ -190,12 +248,21 @@ class PreprocessingPipeline:
         InputValidator.validate_type(y, pd.DataFrame, "data")
         InputValidator.validate_type(test_size_, float, "test_size")
         InputValidator.validate_type(validation_size, float, "validation_size")
-        test_val_proportion = validation_size / (test_size_ + validation_size)  # Proportion of test to validation_size
-        x_train, x_test_val, y_train, y_test_val = train_test_split(x, y, test_size=(test_size_ + validation_size), random_state=random_state_)
-        x_test, x_val, y_test, y_val = train_test_split(x_test_val, y_test_val, test_size=test_val_proportion, random_state=random_state_)
+        test_val_proportion = validation_size / (
+            test_size_ + validation_size
+        )  # Proportion of test to validation_size
+        x_train, x_test_val, y_train, y_test_val = train_test_split(
+            x, y, test_size=(test_size_ + validation_size), random_state=random_state_
+        )
+        x_test, x_val, y_test, y_val = train_test_split(
+            x_test_val,
+            y_test_val,
+            test_size=test_val_proportion,
+            random_state=random_state_,
+        )
 
         return x_train, x_test, x_val, y_train, y_test, y_val
 
     def run_pipeline(self) -> pd.DataFrame:
         """
         Run the entire preprocessing pipeline: load data, process features, normalize, and save to CSV.
@@ -203,7 +270,7 @@ class PreprocessingPipeline:
         :param normalizer_type: The type of normalizer to use.
         :return: The final normalized Pandas DataFrame.
         """
 
         # Step 1: Load raw data
         griftpark_data, utrecht_data = self.data_loader()
 
@@ -212,33 +279,44 @@ class PreprocessingPipeline:
         preprocessed_data = self.feature_processor()
 
         # Step 3: Save processed data
-        self.data_loader.save_to_csv('v3_lagged_no_missing_predicted_data.csv', preprocessed_data)
+        self.data_loader.save_to_csv(
+            "v3_lagged_no_missing_predicted_data.csv", preprocessed_data
+        )
 
         # Step 4: Split data into train, test, and validation sets
-        columns_to_predict = ['no2', 'o3', 'no2 + day 1', 'o3 + day 1', 'no2 + day 2', 'o3 + day 2']
+        columns_to_predict = [
+            "no2",
+            "o3",
+            "no2 + day 1",
+            "o3 + day 1",
+            "no2 + day 2",
+            "o3 + day 2",
+        ]
         x = preprocessed_data.drop(columns_to_predict, axis=1)
         y = preprocessed_data[columns_to_predict]
-        x_train, x_test, x_val, y_train, y_test, y_val = self.train_test_validation_split(x, y)
+        x_train, x_test, x_val, y_train, y_test, y_val = (
+            self.train_test_validation_split(x, y)
+        )
 
         # Step 5: Normalize data for 3 sets (x_train, x_test, x_val)
-        x_train[x_train.columns] = self.normalizer.fit_transform(x_train[x_train.columns])
+        x_train[x_train.columns] = self.normalizer.fit_transform(
+            x_train[x_train.columns]
+        )
         x_test[x_test.columns] = self.normalizer.transform(x_test[x_test.columns])
         x_val[x_val.columns] = self.normalizer.transform(x_val[x_val.columns])
 
         # Convert the normalized NumPy array back to a DataFrame
         # normalized_x_train = pd.DataFrame(x_train, columns=preprocessed_data.columns, index=preprocessed_data.index)
 
-        self.data_loader.save_to_csv('x_train.csv', x_train)
-        self.data_loader.save_to_csv('x_test.csv', x_test)
-        self.data_loader.save_to_csv('x_val.csv', x_val)
-        self.data_loader.save_to_csv('y_train.csv', y_train)
-        self.data_loader.save_to_csv('y_test.csv', y_test)
-        self.data_loader.save_to_csv('y_val.csv', y_val)
+        # Step 6: Save normalized data
+        self.data_loader.save_to_csv("x_train.csv", x_train)
+        self.data_loader.save_to_csv("x_test.csv", x_test)
+        self.data_loader.save_to_csv("x_val.csv", x_val)
+        self.data_loader.save_to_csv("y_train.csv", y_train)
+        self.data_loader.save_to_csv("y_test.csv", y_test)
+        self.data_loader.save_to_csv("y_val.csv", y_val)
 
         # Convert the normalized NumPy array back to a DataFrame
         # normalized_df = pd.DataFrame(normalized_data, columns=preprocessed_data.columns, index=preprocessed_data.index)
 
         return preprocessed_data
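merge_data reconciles two date formats before joining: the weather export uses ISO dates (%Y-%m-%d) while the air-quality export uses day/month/year strings. The conversion round-trip in isolation, on made-up values:

    import pandas as pd

    s = pd.Series(["2024-09-11", "2014-01-29"])
    reformatted = pd.to_datetime(s, format="%Y-%m-%d").dt.strftime("%d/%m/%Y")
    print(list(reformatted))  # ['11/09/2024', '29/01/2014']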
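The apply_time_shift change above is formatting-only, so the shift semantics are unchanged: assuming rows are sorted newest-first (as sort_data_by_date leaves them), shift(-t) reaches t days into the past for lagged feature columns and shift(t) reaches t days into the future for the label columns. A self-contained sketch of the same construction on a toy frame (illustrative values only):

    import pandas as pd

    # Toy stand-in for merged_data; rows are sorted newest-first.
    df = pd.DataFrame({"no2": [40.0, 38.0, 35.0, 30.0]},
                      index=["04/01", "03/01", "02/01", "01/01"])

    for t in range(1, 3):
        df[f"no2 - day {t}"] = df["no2"].shift(-t)  # past values (features)
        df[f"no2 + day {t}"] = df["no2"].shift(t)   # future values (labels)

    print(df)  # NaNs appear at the edges, where no past/future row exists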
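One behavior of preprocess_data worth noting when reading the new code: pandas' dropna returns a filtered copy unless inplace=True is passed, so the dropna(subset=["o3", "no2"]) call above does not modify self.merged_data, and rows with missing labels survive that line. A minimal demonstration of the difference:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"o3": [1.0, np.nan], "no2": [2.0, 3.0]})
    df.dropna(subset=["o3", "no2"])  # returns a copy; df is unchanged
    print(len(df))  # 2
    df.dropna(subset=["o3", "no2"], inplace=True)
    print(len(df))  # 1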
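train_test_validation_split carves off test and validation together and then splits that holdout in two, so the proportion arithmetic is worth checking once by hand. With the default arguments (a sketch of the arithmetic, not an additional API):

    test_size_, validation_size = 0.15, 0.15

    holdout = test_size_ + validation_size           # 0.30 leaves 70% for training
    test_val_proportion = validation_size / holdout  # 0.50 of the holdout

    train_frac = 1 - holdout                         # 0.70
    test_frac = holdout * (1 - test_val_proportion)  # 0.15
    val_frac = holdout * test_val_proportion         # 0.15
    assert abs(train_frac + test_frac + val_frac - 1.0) < 1e-9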
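Step 5 fits the MinMaxScaler on the training split only and reuses the learned bounds for test and validation, which keeps their statistics out of training. The fit/transform pattern in isolation, on toy arrays:

    import numpy as np
    from sklearn.preprocessing import MinMaxScaler

    x_train = np.array([[1.0], [5.0], [9.0]])
    x_val = np.array([[3.0], [11.0]])

    scaler = MinMaxScaler()
    x_train_scaled = scaler.fit_transform(x_train)  # learns min=1, max=9
    x_val_scaled = scaler.transform(x_val)          # reuses training bounds

    print(x_val_scaled.ravel())  # [0.25 1.25] -- may fall outside [0, 1]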
air-quality-forecast/main.py
CHANGED
@@ -1,6 +1,5 @@
 from data_pipeline import PreprocessingPipeline
 
-
 if __name__ == "__main__":
     pipeline = PreprocessingPipeline()
-    pipeline.run_pipeline()
+    pipeline.run_pipeline()
air-quality-forecast/utils.py
CHANGED
@@ -5,20 +5,23 @@ import os
 File with utilities
 """
 
+
 class InputValidator:
     @staticmethod
     def validate_type(value, expected_type, variable_name: str) -> None:
         """
         Validate the type of the given variable.
 
         :param value: The value to validate.
         :param expected_type: The expected type of the value.
         :param variable_name: The name of the variable for error messages.
         :raises TypeError: If the value is not of the expected type.
         """
         if not isinstance(value, expected_type):
-            raise TypeError(f'{variable_name} must be of type {expected_type.__name__}.')
-
+            raise TypeError(
+                f"{variable_name} must be of type {expected_type.__name__}."
+            )
+
     @staticmethod
     def validate_file_exists(path: str, variable_name: str) -> None:
         """
@@ -31,33 +34,63 @@ class InputValidator:
         if not os.path.exists(path):
             raise FileNotFoundError(f"{variable_name} path {path} does not exist.")
 
+
 class FeatureSelector:
     def uninformative_columns() -> list:
-        """Those columns provide no information that the model can use"""
-        return ['Unnamed: 0', 'name', 'datetime', 'sunrise', 'sunset', 'preciptype', 'conditions', 'description', 'icon', 'stations']
-
+        """Those columns provide no information that the model can use"""
+        return [
+            "Unnamed: 0",
+            "name",
+            "datetime",
+            "sunrise",
+            "sunset",
+            "preciptype",
+            "conditions",
+            "description",
+            "icon",
+            "stations",
+        ]
+
     def rename_initial_columns(data):
-        """Rename the columns of the datasets to remove whitespaces."""
-        data = data.rename(columns={' pm25': 'pm25', ' pm10': 'pm10', ' o3': 'o3', ' no2': 'no2', ' so2': 'so2'})
-        return data
-
+        """Rename the columns of the datasets to remove whitespaces."""
+        data = data.rename(
+            columns={
+                " pm25": "pm25",
+                " pm10": "pm10",
+                " o3": "o3",
+                " no2": "no2",
+                " so2": "so2",
+            }
+        )
+        return data
+
     def change_to_numeric(data):
-        """Change each entry to a numerical value."""
-        data.loc[:, data.columns != 'date'] = data.loc[:, data.columns != 'date'].apply(pd.to_numeric, errors='coerce')
+        """Change each entry to a numerical value."""
+        data.loc[:, data.columns != "date"] = data.loc[:, data.columns != "date"].apply(
+            pd.to_numeric, errors="coerce"
+        )
         return data
 
     def select_cols_by_correlation(data) -> list:
-        """Select columns based on correlation criteria."""
-        #Step 1: Calculate correlations between features and O3/NO2
-        corr_no2 = abs(data.loc[:, data.columns != 'date'].corr()['no2'])
-        corr_o3 = abs(data.loc[:, data.columns != 'date'].corr()['o3'])
+        """Select columns based on correlation criteria."""
+        # Step 1: Calculate correlations between features and O3/NO2
+        corr_no2 = abs(data.loc[:, data.columns != "date"].corr()["no2"])
+        corr_o3 = abs(data.loc[:, data.columns != "date"].corr()["o3"])
 
-        #Step 2: Remove the columns not correlated with any of the labels
+        # Step 2: Remove the columns not correlated with any of the labels
         columns_above_threshold = (corr_no2 > 0.3) | (corr_o3 > 0.3)
         selected_columns = columns_above_threshold[columns_above_threshold].index
 
-        #Step 3: Remove the columns with high correlations with each other (chosen by manual inspection of the correlation matrix)
-        to_remove = ['feelslikemax', 'feelslikemin', 'feelslike', 'tempmin', 'tempmax', 'dew', 'solarenergy', 'uvindex']
+        # Step 3: Remove the columns with high correlations with each other (chosen by manual inspection of the correlation matrix)
+        to_remove = [
+            "feelslikemax",
+            "feelslikemin",
+            "feelslike",
+            "tempmin",
+            "tempmax",
+            "dew",
+            "solarenergy",
+            "uvindex",
+        ]
         selected_columns = [item for item in selected_columns if item not in to_remove]
         return selected_columns
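select_cols_by_correlation keeps every feature whose absolute Pearson correlation with no2 or o3 exceeds 0.3, then drops a hand-picked list of mutually correlated columns. The thresholding step on a toy frame (synthetic data; the 0.3 cutoff is the one used above):

    import numpy as np
    import pandas as pd

    rng = np.random.default_rng(0)
    no2 = rng.normal(size=200)
    df = pd.DataFrame({
        "no2": no2,
        "o3": -0.6 * no2 + rng.normal(scale=0.8, size=200),   # correlated
        "temp": 0.5 * no2 + rng.normal(scale=0.5, size=200),  # correlated
        "moonphase": rng.normal(size=200),                    # noise
    })

    corr_no2 = df.corr()["no2"].abs()
    corr_o3 = df.corr()["o3"].abs()
    above = (corr_no2 > 0.3) | (corr_o3 > 0.3)
    print(list(above[above].index))  # typically ['no2', 'o3', 'temp']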
data/interim/correlation_matrix.csv
ADDED
@@ -0,0 +1,30 @@
,pm25,pm10,o3,no2,so2,tempmax,tempmin,temp,feelslikemax,feelslikemin,feelslike,dew,humidity,precip,precipprob,precipcover,snow,snowdepth,windgust,windspeed,winddir,sealevelpressure,cloudcover,visibility,solarradiation,solarenergy,uvindex,severerisk,moonphase
pm25,1,0.602,-0.239,0.397,0.044,-0.293,-0.434,-0.38,-0.293,-0.419,-0.374,-0.361,0.177,-0.224,-0.24,-0.241,0.103,0.099,-0.333,-0.243,-0.225,0.274,-0.106,-0.599,-0.223,-0.224,-0.209,0.053,0.07
pm10,0.602,1,-0.146,0.504,0.085,-0.114,-0.245,-0.191,-0.116,-0.227,-0.184,-0.212,0.018,-0.182,-0.252,-0.229,0.052,0.036,-0.251,-0.199,-0.157,0.246,-0.175,-0.356,-0.097,-0.097,-0.074,0.113,0.02
o3,-0.239,-0.146,1,-0.505,0.001,0.599,0.412,0.555,0.577,0.42,0.533,0.356,-0.587,-0.048,-0.166,-0.133,-0.051,-0.053,-0.006,-0.018,-0.062,0.036,-0.275,0.418,0.636,0.636,0.608,0.073,0.029
no2,0.397,0.504,-0.505,1,0.012,-0.384,-0.453,-0.444,-0.377,-0.43,-0.424,-0.379,0.285,-0.113,-0.083,-0.076,0.038,0.039,-0.243,-0.209,-0.038,0.146,-0.088,-0.35,-0.372,-0.372,-0.358,-0.026,0.018
so2,0.044,0.085,0.001,0.012,1,0.044,0.023,0.037,0.052,0.036,0.046,0.045,-0.003,-0.021,-0.018,-0.022,-0.016,-0.033,-0.019,-0.045,0.082,0.03,-0.008,-0.103,0.038,0.037,0.058,,0.078
tempmax,-0.293,-0.114,0.599,-0.384,0.044,1,0.828,0.967,0.993,0.861,0.964,0.849,-0.494,-0.063,-0.203,-0.24,-0.165,-0.173,-0.139,-0.172,0,0.045,-0.242,0.353,0.726,0.726,0.711,0.182,0.026
tempmin,-0.434,-0.245,0.412,-0.453,0.023,0.828,1,0.934,0.829,0.984,0.923,0.935,-0.233,0.117,0.077,-0.002,-0.141,-0.179,0.063,-0.002,0.147,-0.158,0.107,0.284,0.477,0.477,0.459,0.146,0.017
temp,-0.38,-0.191,0.555,-0.444,0.037,0.967,0.934,1,0.963,0.949,0.993,0.925,-0.412,0.009,-0.09,-0.146,-0.162,-0.186,-0.052,-0.102,0.071,-0.04,-0.097,0.35,0.656,0.657,0.636,0.166,0.021
feelslikemax,-0.293,-0.116,0.577,-0.377,0.052,0.993,0.829,0.963,1,0.867,0.97,0.857,-0.467,-0.055,-0.197,-0.235,-0.175,-0.19,-0.152,-0.19,0.007,0.049,-0.221,0.337,0.715,0.715,0.703,0.181,0.026
feelslikemin,-0.419,-0.227,0.42,-0.43,0.036,0.861,0.984,0.949,0.867,1,0.952,0.946,-0.242,0.078,0.026,-0.057,-0.162,-0.192,-0.028,-0.098,0.144,-0.099,0.066,0.281,0.528,0.528,0.512,0.144,0.014
feelslike,-0.374,-0.184,0.533,-0.424,0.046,0.964,0.923,0.993,0.97,0.952,1,0.927,-0.389,0,-0.104,-0.159,-0.176,-0.198,-0.097,-0.153,0.076,-0.016,-0.096,0.334,0.66,0.66,0.642,0.159,0.019
dew,-0.361,-0.212,0.356,-0.379,0.045,0.849,0.935,0.925,0.857,0.946,0.927,1,-0.04,0.13,0.072,0.001,-0.15,-0.185,-0.063,-0.128,0.154,-0.12,0.103,0.127,0.43,0.43,0.416,0.156,0.024
humidity,0.177,0.018,-0.587,0.285,-0.003,-0.494,-0.233,-0.412,-0.467,-0.242,-0.389,-0.04,1,0.285,0.391,0.387,0.076,0.05,-0.065,-0.081,0.155,-0.171,0.472,-0.658,-0.698,-0.698,-0.685,-0.063,0.007
precip,-0.224,-0.182,-0.048,-0.113,-0.021,-0.063,0.117,0.009,-0.055,0.078,0,0.13,0.285,1,0.48,0.682,0.039,0.001,0.374,0.32,0.12,-0.403,0.278,-0.119,-0.219,-0.219,-0.223,-0.043,-0.021
precipprob,-0.24,-0.252,-0.166,-0.083,-0.018,-0.203,0.077,-0.09,-0.197,0.026,-0.104,0.072,0.391,0.48,1,0.642,0.069,0.012,0.433,0.37,0.279,-0.476,0.419,-0.113,-0.356,-0.356,-0.355,-0.005,-0.02
precipcover,-0.241,-0.229,-0.133,-0.076,-0.022,-0.24,-0.002,-0.146,-0.235,-0.057,-0.159,0.001,0.387,0.682,0.642,1,0.101,0.034,0.41,0.335,0.187,-0.466,0.378,-0.175,-0.375,-0.375,-0.387,-0.047,-0.032
snow,0.103,0.052,-0.051,0.038,-0.016,-0.165,-0.141,-0.162,-0.175,-0.162,-0.176,-0.15,0.076,0.039,0.069,0.101,1,0.346,0.027,0.041,-0.057,-0.061,0.056,-0.096,-0.097,-0.097,-0.102,-0.011,0.034
snowdepth,0.099,0.036,-0.053,0.039,-0.033,-0.173,-0.179,-0.186,-0.19,-0.192,-0.198,-0.185,0.05,0.001,0.012,0.034,0.346,1,-0.012,0.016,-0.062,-0.007,0.002,-0.042,-0.076,-0.076,-0.075,-0.015,0.016
windgust,-0.333,-0.251,-0.006,-0.243,-0.019,-0.139,0.063,-0.052,-0.152,-0.028,-0.097,-0.063,-0.065,0.374,0.433,0.41,0.027,-0.012,1,0.907,0.191,-0.418,0.201,0.173,-0.174,-0.174,-0.176,-0.022,-0.021
windspeed,-0.243,-0.199,-0.018,-0.209,-0.045,-0.172,-0.002,-0.102,-0.19,-0.098,-0.153,-0.128,-0.081,0.32,0.37,0.335,0.041,0.016,0.907,1,0.101,-0.374,0.136,0.148,-0.187,-0.187,-0.184,-0.045,-0.015
winddir,-0.225,-0.157,-0.062,-0.038,0.082,0,0.147,0.071,0.007,0.144,0.076,0.154,0.155,0.12,0.279,0.187,-0.057,-0.062,0.191,0.101,1,-0.096,0.224,-0.003,-0.029,-0.028,-0.037,0.003,-0.047
sealevelpressure,0.274,0.246,0.036,0.146,0.03,0.045,-0.158,-0.04,0.049,-0.099,-0.016,-0.12,-0.171,-0.403,-0.476,-0.466,-0.061,-0.007,-0.418,-0.374,-0.096,1,-0.325,-0.045,0.216,0.216,0.213,-0.032,-0.005
cloudcover,-0.106,-0.175,-0.275,-0.088,-0.008,-0.242,0.107,-0.097,-0.221,0.066,-0.096,0.103,0.472,0.278,0.419,0.378,0.056,0.002,0.201,0.136,0.224,-0.325,1,-0.208,-0.452,-0.452,-0.438,-0.063,-0.026
visibility,-0.599,-0.356,0.418,-0.35,-0.103,0.353,0.284,0.35,0.337,0.281,0.334,0.127,-0.658,-0.119,-0.113,-0.175,-0.096,-0.042,0.173,0.148,-0.003,-0.045,-0.208,1,0.45,0.45,0.447,0.033,-0.023
solarradiation,-0.223,-0.097,0.636,-0.372,0.038,0.726,0.477,0.656,0.715,0.528,0.66,0.43,-0.698,-0.219,-0.356,-0.375,-0.097,-0.076,-0.174,-0.187,-0.029,0.216,-0.452,0.45,1,1,0.965,0.118,0.007
solarenergy,-0.224,-0.097,0.636,-0.372,0.037,0.726,0.477,0.657,0.715,0.528,0.66,0.43,-0.698,-0.219,-0.356,-0.375,-0.097,-0.076,-0.174,-0.187,-0.028,0.216,-0.452,0.45,1,1,0.965,0.119,0.007
uvindex,-0.209,-0.074,0.608,-0.358,0.058,0.711,0.459,0.636,0.703,0.512,0.642,0.416,-0.685,-0.223,-0.355,-0.387,-0.102,-0.075,-0.176,-0.184,-0.037,0.213,-0.438,0.447,0.965,0.965,1,0.115,0.004
severerisk,0.053,0.113,0.073,-0.026,,0.182,0.146,0.166,0.181,0.144,0.159,0.156,-0.063,-0.043,-0.005,-0.047,-0.011,-0.015,-0.022,-0.045,0.003,-0.032,-0.063,0.033,0.118,0.119,0.115,1,0.023
moonphase,0.07,0.02,0.029,0.018,0.078,0.026,0.017,0.021,0.026,0.014,0.019,0.024,0.007,-0.021,-0.02,-0.032,0.034,0.016,-0.021,-0.015,-0.047,-0.005,-0.026,-0.023,0.007,0.007,0.004,0.023,1
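The commit adds this matrix as a data artifact; nothing in the diff shows how it was produced, but a matrix of this shape can be regenerated from the merged dataset with pandas. A sketch (the input path is hypothetical):

    import pandas as pd

    # Hypothetical input: the merged, numeric dataset from the pipeline.
    merged = pd.read_csv("data/interim/merged_data.csv")
    corr = merged.corr(numeric_only=True).round(3)
    corr.to_csv("data/interim/correlation_matrix.csv")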
extra_scripts/corr_map.R
ADDED
@@ -0,0 +1,34 @@
library(ggplot2)
library(reshape2)
library(dplyr)
library(extrafont)

loadfonts(device = "pdf")  # Load fonts

correlation_matrix <- read.csv(file.choose(), row.names = 1)

correlation_matrix[abs(correlation_matrix) < 0.3] <- NA

correlation_matrix[lower.tri(correlation_matrix)] <- NA
diag(correlation_matrix) <- NA

melted_cor <- melt(as.matrix(correlation_matrix), na.rm = TRUE)

melted_cor$fill_value <- ifelse(is.na(melted_cor$value), NA, melted_cor$value)

ggplot(melted_cor, aes(x = Var1, y = Var2)) +
  geom_tile(aes(fill = fill_value), color = "black") +
  labs(x = NULL, y = NULL, fill = "Pearson's\nCorrelation") +
  scale_fill_gradient2(mid = "#FBFEF9", low = "#0C6291", high = "#A63446",
                       limits = c(-1, 1), na.value = "lightgray") +
  theme_classic() +
  scale_x_discrete(expand = c(0, 0)) +
  scale_y_discrete(expand = c(0, 0)) +
  theme(text = element_text(family = "Arial"), # Use a common font
        plot.title = element_text(size = 14, family = "Arial"), # Title font
        axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "right", # Position of the legend
        legend.title = element_text(size = 12), # Title size
        legend.text = element_text(size = 10), # Text size
        legend.key.size = unit(1.5, "cm"), # Size of the legend keys
        legend.key.width = unit(1, "cm")) # Width of the legend keys
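corr_map.R renders the matrix as a masked heatmap: correlations with |r| < 0.3, the lower triangle, and the diagonal are blanked out. For readers without R, the same masking logic can be sketched in Python with matplotlib (this script is not part of the repository):

    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt

    corr = pd.read_csv("data/interim/correlation_matrix.csv", index_col=0)

    # Same masking as the R script: hide weak correlations, the lower
    # triangle, and the diagonal (masked cells become NaN and render blank).
    masked = corr.where(corr.abs() >= 0.3)
    masked = masked.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))

    fig, ax = plt.subplots(figsize=(10, 8))
    im = ax.imshow(masked, vmin=-1, vmax=1, cmap="coolwarm")
    ax.set_xticks(range(len(corr.columns)))
    ax.set_xticklabels(corr.columns, rotation=45, ha="right")
    ax.set_yticks(range(len(corr.index)))
    ax.set_yticklabels(corr.index)
    fig.colorbar(im, label="Pearson's correlation")
    fig.tight_layout()
    plt.show()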
extra_scripts/histograms.tex
ADDED
@@ -0,0 +1,167 @@
\documentclass[tikz]{standalone}
\usepackage{pgfplots}
\usepackage{pgfplotstable}
\pgfplotsset{compat=1.17}

\begin{document}

% Load the dataset
\pgfplotstableread[col sep=comma]{data/processed/v2_merged_selected_features_with_missing.csv}\datatable

\begin{tikzpicture}

% Main title at the top
\node[align=center] at (7, 3.75) {\textbf{Distributions of Environmental Variables}};

% First row of plots
\begin{axis}[
    at={(0,0)},
    width=5.5cm,
    xlabel=PM$_{2.5}$ ($\mu g /m^3$),
    tick label style={font=\fontsize{8}{8}\selectfont},
    ylabel=Frequency,
    ybar=0pt, bar width=1,
]
\addplot+[fill=cyan,
    fill opacity=0.5,
    hist={bins=20}
] table [y index=1] {\datatable};
\end{axis}

\begin{axis}[
    at={(5cm,0)},
    width=5.5cm,
    tick label style={font=\fontsize{8}{8}\selectfont},
    xlabel=PM$_{10}$ ($\mu g /m^3$),
    ybar=0pt, bar width=1,
]
\addplot+[fill=cyan,
    fill opacity=0.5,
    hist={bins=20}
] table [y index=2] {\datatable};
\end{axis}

\begin{axis}[
    at={(10cm,0)},
    width=5.5cm,
    tick label style={font=\fontsize{8}{8}\selectfont},
    xlabel=O$_{3}$ ($\mu g /m^3$),
    ybar=0pt, bar width=1,
]
\addplot+[fill=cyan,
    fill opacity=0.5,
    hist={bins=20}
] table [y index=3] {\datatable};
\end{axis}

% Second row of plots
\begin{axis}[
    at={(0,-5cm)},
    width=5.5cm,
    xlabel=NO$_{2}$ ($\mu g /m^3$),
    tick label style={font=\fontsize{8}{8}\selectfont},
    ylabel=Frequency,
    ybar=0pt, bar width=1,
]
\addplot+[fill=cyan,
    fill opacity=0.5,
    hist={bins=20}
] table [y index=4] {\datatable};
\end{axis}

\begin{axis}[
    at={(5cm,-5cm)},
    width=5.5cm,
    xlabel=Temperature (°C),
    tick label style={font=\fontsize{8}{8}\selectfont},
    ybar=0pt, bar width=1,
]
\addplot+[fill=cyan,
    fill opacity=0.5,
    hist={bins=20}
] table [y index=5] {\datatable};
\end{axis}

\begin{axis}[
    at={(10cm,-5cm)},
    width=5.5cm,
    xlabel=Humidity (\%),
    tick label style={font=\fontsize{8}{8}\selectfont},
    ybar=0pt, bar width=1,
]
\addplot+[fill=cyan,
    fill opacity=0.5,
    hist={bins=20}
] table [y index=6] {\datatable};
\end{axis}

% Third row of plots
\begin{axis}[
    at={(0,-10cm)},
    width=5.5cm,
    xlabel=Visibility ($km$),
    tick label style={font=\fontsize{8}{8}\selectfont},
    ylabel=Frequency,
    ybar=0pt, bar width=1,
]
\addplot+[fill=cyan,
    fill opacity=0.5,
    hist={bins=20}
] table [y index=7] {\datatable};
\end{axis}

\begin{axis}[
    at={(5cm,-10cm)},
    width=5.5cm,
    xlabel=Solar Radiation ($W/m^2$),
    tick label style={font=\fontsize{8}{8}\selectfont},
    ybar=0pt, bar width=1,
]
\addplot+[fill=cyan,
    fill opacity=0.5,
    hist={bins=20}
] table [y index=8] {\datatable};
\end{axis}

\begin{axis}[
    at={(10cm,-10cm)},
    width=5.5cm,
    xlabel=Precipitation ($mm$),
    tick label style={font=\fontsize{8}{8}\selectfont},
    ybar=0pt, bar width=1,
]
\addplot+[fill=cyan,
    fill opacity=0.5,
    hist={bins=20}
] table [y index=9] {\datatable};
\end{axis}

\begin{axis}[
    at={(2cm,-15cm)},
    width=5.5cm,
    xlabel=Windspeed ($km/h$),
    ylabel=Frequency,
    tick label style={font=\fontsize{8}{8}\selectfont},
    ybar=0pt, bar width=1,
]
\addplot+[fill=cyan,
    fill opacity=0.5,
    hist={bins=20}
] table [y index=10] {\datatable};
\end{axis}

\begin{axis}[
    at={(8cm,-15cm)},
    width=5.5cm,
    xlabel=Wind Direction (degrees),
    tick label style={font=\fontsize{8}{8}\selectfont},
    ybar=0pt, bar width=1,
]
\addplot+[fill=cyan,
    fill opacity=0.5,
    hist={bins=20}
] table [y index=11] {\datatable};
\end{axis}

\end{tikzpicture}
\end{document}