If you have a spreadsheet file of daily climate data downloaded from BMKG Data Online, you need to parse it before the data is ready to use.
The file's structure is a bit unconventional for a typical Excel sheet: the values for ID WMO, Nama Stasiun, Lintang, and so on live in the header names themselves, not in the cells beneath those headers, and the climate data's own header row does not start until row 9.
See the screenshot below!
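If you don't have the file at hand, you can also peek at the metadata block directly with pandas; a minimal sketch, using a hypothetical filename:

import pandas as pd

# Read only the metadata rows above the data header (row 9), with header=None
# so pandas does not swallow the first metadata row as column names
preview = pd.read_excel('./input/BMKG/some_station.xlsx', header=None, nrows=8)
print(preview)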
Currently I have around 180 xlsx files (one xlsx per station) in a folder. Let's extract the station information from each file, compile it, and save it as a single csv file.
import pandas as pd
import os
import re
from tqdm import tqdm
# Function to extract data after the colon
def get_data_after_colon(cell_value):
match = re.search(r':\s+(.*)', str(cell_value))
if match:
return match.group(1).strip()
else:
return str(cell_value).strip()
# Function to extract only numeric characters
def get_only_numbers(cell_value):
return ''.join(re.findall(r'\d+', str(cell_value)))
# Directory containing the .xlsx files
folder_path = './input/BMKG/'
# Get a list of all xlsx files in the directory
xlsx_files = [f for f in os.listdir(folder_path) if f.endswith('.xlsx')]
all_data = []
# Loop through each file and extract data
for file in tqdm(xlsx_files, desc="Processing files"):
try:
# Load the spreadsheet
xls = pd.ExcelFile(folder_path + file)
sheet = xls.parse(xls.sheet_names[0])
# Extract data
ID_WMO = get_only_numbers(sheet.columns[2]) # From column header
Station = get_data_after_colon(sheet.iloc[0, 2])
Lat = float(get_data_after_colon(sheet.iloc[1, 2]))
Lon = float(get_data_after_colon(sheet.iloc[2, 2]))
Elevation = float(get_data_after_colon(sheet.iloc[3, 2]))
all_data.append([ID_WMO, Station, Lon, Lat, Elevation])
except Exception as e:
print(f"Error processing file {file}: {e}")
# Convert to dataframe
df = pd.DataFrame(all_data, columns=['ID_WMO', 'Station', 'Lon', 'Lat', 'Elevation'])
# Sort by ID_WMO
df = df.sort_values(by=['ID_WMO'])
# Add ID column as index
df = df.reset_index(drop=True)
df['ID'] = df.index + 1
# Reorder columns
df = df[['ID', 'ID_WMO', 'Station', 'Lon', 'Lat', 'Elevation']]
# Save to csv
df.to_csv('./output/idn_cli_weatherstation_location_bmkg.csv', index=False)
# Preview the output
df
Processing files: 100%|██████████████████████████████████████████████| 182/182 [05:40<00:00, 1.87s/it]
Error processing file ~$Balai Besar Meteorologi Klimatologi dan Geofisika Wilayah I.xlsx: Excel file format cannot be determined, you must specify an engine manually.
Error processing file ~$Halim Perdana Kusuma Jakarta.xlsx: Excel file format cannot be determined, you must specify an engine manually.

The two failures are harmless: files starting with ~$ are temporary lock files that Excel creates while a workbook is open, not real workbooks.
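To skip them on future runs, tighten the file listing; a minimal sketch:

import os

folder_path = './input/BMKG/'
# Keep real workbooks only; ~$-prefixed files are Excel lock files
xlsx_files = [f for f in os.listdir(folder_path)
              if f.endswith('.xlsx') and not f.startswith('~$')]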
  | ID | ID_WMO | Station | Lon | Lat | Elevation |
---|---|---|---|---|---|---|
0 | 1 | 96001 | Stasiun Meteorologi Maimun Saleh | 95.33785 | 5.87655 | 126.0 |
1 | 2 | 96009 | Stasiun Meteorologi Malikussaleh | 96.94749 | 5.22869 | 28.0 |
2 | 3 | 96011 | Stasiun Meteorologi Sultan Iskandar Muda | 95.41700 | 5.52244 | 20.0 |
3 | 4 | 96013 | Stasiun Geofisika Aceh Besar | 95.29600 | 5.49600 | 32.0 |
4 | 5 | 96015 | Stasiun Meteorologi Cut Nyak Dhien Nagan Raya | 96.24796 | 4.04928 | 3.0 |
... | ... | ... | ... | ... | ... | ... |
175 | 176 | 97812 | Stasiun Geofisika Nabire | 132.75000 | -5.64000 | 22.0 |
176 | 177 | 97876 | Stasiun Meteorologi Tanah Merah | 140.31000 | -6.10000 | 97.0 |
177 | 178 | 97900 | Stasiun Meteorologi Mathilda Batlayeri | 131.30000 | -7.98000 | 24.0 |
178 | 179 | 97978 | Stasiun Klimatologi Merauke | 140.51700 | -8.38700 | 5.0 |
179 | 180 | 97980 | Stasiun Meteorologi Mopah | 140.41568 | -8.52019 | 0.0 |
180 rows × 6 columns
Let's plot the station coordinates on a map to see the spatial distribution across the country.
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap
# Load the CSV data
df = pd.read_csv('./output/idn_cli_weatherstation_location_bmkg.csv')
# Initialize a new map
fig, ax = plt.subplots(figsize=(10,15))
m = Basemap(resolution='i', projection='merc', llcrnrlat=-11, urcrnrlat=6, llcrnrlon=95, urcrnrlon=141, ax=ax)
m.drawcoastlines()
m.drawcountries()
m.drawmapboundary(fill_color='white') # Set sea color to white
m.fillcontinents(color='grey',lake_color='white') # Set land color to grey
# Plot each weather station on the map
for _, row in df.iterrows():
x, y = m(row['Lon'], row['Lat'])
m.plot(x, y, 'ro', markersize=5) # Change marker to red
#plt.text(x, y, row['ID_WMO'], fontsize=9)
# Save the map as a PNG
plt.savefig('./output/idn_cli_weatherstation_map_bmkg.png', dpi=300)
# Show the map
plt.show()
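As a side note, the per-station plot calls can be collapsed into a single draw call; a minimal equivalent sketch, reusing the df and Basemap instance m from above:

# Project all station coordinates at once; Basemap accepts arrays
x, y = m(df['Lon'].values, df['Lat'].values)
m.scatter(x, y, s=25, c='red', zorder=5)  # one call instead of one per station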
Next, extract the climate variable of interest, in this case RR (rainfall), from each xlsx and compile it into a single csv file.
For this case, I would like to extract daily rainfall from each station starting 1 June 2000, arranged in wide format with the column structure: ID, Date, Julian Date, ID_WMO(1), ID_WMO(2), ..., ID_WMO(n). That is, the data is pivoted so that each unique ID_WMO value forms a column in the resulting DataFrame, and the RR values populate these columns.
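The loop below builds this wide table incrementally. Conceptually, the same reshaping can be expressed as a pandas pivot over long-format records; a minimal sketch with made-up values, just to show the target shape:

import pandas as pd

# Hypothetical long-format records: one row per (date, station) observation
records = pd.DataFrame({
    'Date': ['01-06-2000', '01-06-2000', '02-06-2000'],
    'ID_WMO': ['96001', '96009', '96001'],
    'RR': [0.0, 5.2, 1.4],
})

# Pivot so each unique ID_WMO becomes a column, filled with its RR values
wide = records.pivot(index='Date', columns='ID_WMO', values='RR').reset_index()
print(wide)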
import pandas as pd
import os
from tqdm import tqdm
# Function to convert a date in DD-MM-YYYY format to its Julian date
def to_julian_date(date_string):
try:
date_obj = pd.to_datetime(date_string, format='%d-%m-%Y')
return int(date_obj.strftime('%j'))
except:
return None
# Directory containing the .xlsx files
folder_path = './input/BMKG/'
# Get a list of all xlsx files in the directory
xlsx_files = [f for f in os.listdir(folder_path) if f.endswith('.xlsx')]
# Dictionary to store the data
data_dict = {"ID": [], "Date": [], "JD": []}
# Loop through each file and extract data
for file in tqdm(xlsx_files, desc="Processing files"):
try:
# Load the spreadsheet
xls = pd.ExcelFile(folder_path + file)
sheet = xls.parse(xls.sheet_names[0], skiprows=8) # Start reading from row 9
# Extract ID_WMO value from column C1
ID_WMO = ''.join(filter(str.isdigit, xls.parse(xls.sheet_names[0]).columns[2]))
# Check and initialize ID_WMO in data_dict if not already present
if ID_WMO not in data_dict:
data_dict[ID_WMO] = [None for _ in range(len(data_dict['ID']))]
# Loop through each row in the sheet starting from the 10th row
for index, row in sheet.iterrows():
date_string = row['Tanggal']
# Check if date_string is empty, and if so, break out of the loop for this sheet
if pd.isna(date_string):
break
# Check if the date is 01-06-2000 or later
if pd.to_datetime(date_string, format='%d-%m-%Y') >= pd.to_datetime('01-06-2000', format='%d-%m-%Y'):
julian_date = to_julian_date(date_string)
rr_value = row['RR']
# If date_string is new, append to data_dict
if date_string not in data_dict["Date"]:
data_dict["ID"].append(index + 1)
data_dict["Date"].append(date_string)
data_dict["JD"].append(julian_date)
for key in data_dict:
if key not in ["ID", "Date", "JD"]:
data_dict[key].append(None) # Add None for other ID_WMOs
# Update the RR value for the current ID_WMO
idx = data_dict["Date"].index(date_string)
data_dict[ID_WMO][idx] = rr_value
except Exception as e:
print(f"Error processing file {file}: {e}")
# Convert data dictionary to dataframe
df = pd.DataFrame(data_dict)
# Save to csv
df.to_csv('./output/idn_cli_weatherstation_data_bmkg.csv', index=False)
# Preview the output
df
Processing files: 100%|██████████████████████████████████████████████| 182/182 [35:40<00:00, 11.76s/it]
Error processing file ~$Balai Besar Meteorologi Klimatologi dan Geofisika Wilayah I.xlsx: Excel file format cannot be determined, you must specify an engine manually.
Error processing file ~$Halim Perdana Kusuma Jakarta.xlsx: Excel file format cannot be determined, you must specify an engine manually.

These are the same two Excel lock files as before and can be ignored.
  | ID | Date | JD | 96041 | 96747 | 96793 | 97234 | 96013 | 96039 | 97728 | ... | 96655 | 97630 | 96939 | 96805 | 97340 | 97760 | 97686 | 96505 | 97240 | 96169 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 7458 | 01-06-2000 | 153 | NaN | NaN | NaN | 0.0 | NaN | 0.0 | NaN | ... | 0.5 | 9.0 | NaN | 0.0 | 0.0 | NaN | NaN | NaN | 0.0 | NaN |
1 | 7459 | 02-06-2000 | 154 | NaN | NaN | NaN | 0.0 | NaN | 0.3 | NaN | ... | 13.2 | 33.0 | NaN | 0.0 | 0.0 | NaN | NaN | NaN | 0.0 | NaN |
2 | 7460 | 03-06-2000 | 155 | NaN | NaN | NaN | 0.0 | NaN | 0.0 | NaN | ... | 3.6 | 7.0 | NaN | 4.9 | 0.0 | NaN | NaN | NaN | 2.0 | NaN |
3 | 7461 | 04-06-2000 | 156 | NaN | NaN | NaN | 1.0 | NaN | 0.0 | NaN | ... | 0.0 | 17.0 | NaN | 40.3 | 0.0 | NaN | NaN | NaN | 0.0 | NaN |
4 | 7462 | 05-06-2000 | 157 | NaN | 11.2 | NaN | 0.0 | NaN | 0.0 | NaN | ... | 0.0 | 4.0 | NaN | 0.0 | 0.0 | NaN | NaN | NaN | 0.0 | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
7879 | 15337 | 27-12-2021 | 361 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | ... | 12.5 | 1.7 | 56.0 | 8888.0 | 8888.0 | 0.2 | 4.0 | 0.2 | 2.5 | NaN |
7880 | 15338 | 28-12-2021 | 362 | 8888.0 | NaN | 26.0 | 5.7 | NaN | NaN | NaN | ... | 2.6 | 13.2 | 10.0 | 20.5 | 40.3 | 3.2 | 8888.0 | 17.8 | 9.7 | NaN |
7881 | 15339 | 29-12-2021 | 363 | 0.0 | NaN | 18.5 | 11.0 | NaN | NaN | NaN | ... | 0.0 | 40.6 | 25.0 | 3.9 | 9.1 | 11.9 | 17.8 | 27.4 | 13.0 | NaN |
7882 | 15340 | 30-12-2021 | 364 | 0.0 | NaN | 0.4 | NaN | NaN | NaN | NaN | ... | 3.8 | 0.0 | 0.0 | 1.4 | 17.2 | NaN | 1.4 | NaN | 0.0 | NaN |
7883 | 15341 | 31-12-2021 | 365 | 8.3 | NaN | 0.6 | 17.7 | NaN | NaN | NaN | ... | 4.0 | 0.2 | 8888.0 | 2.9 | 2.9 | 0.0 | 9.3 | 7.5 | NaN | NaN |
7884 rows × 183 columns
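A note on the roughly 35-minute runtime: most of it is spent in data_dict["Date"].index(date_string) and the date_string not in data_dict["Date"] test, which scan the whole date list for every row of every file. A dict mapping each date to its row position makes both checks O(1); a toy demonstration of the idea, with illustrative names:

dates = ['01-06-2000', '02-06-2000']
date_pos = {d: i for i, d in enumerate(dates)}  # date -> row position

def row_index(date_string):
    # O(1) replacement for dates.index(date_string); appends unseen dates
    if date_string not in date_pos:
        date_pos[date_string] = len(dates)
        dates.append(date_string)
    return date_pos[date_string]

print(row_index('02-06-2000'))  # 1, found without scanning the list
print(row_index('03-06-2000'))  # 2, appended as a new row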
Let's calculate the number of days per year where the data is NaN, so we can decide whether there is a good enough reason to drop a station or not.
import pandas as pd
# Load the previously generated CSV file
df = pd.read_csv('./output/idn_cli_weatherstation_data_bmkg.csv')
# Extract the year from the Date column
df['Year'] = pd.to_datetime(df['Date'], format='%d-%m-%Y').dt.year
# Create a dataframe to store annual NaN counts
nan_count_df = df.groupby('Year').apply(lambda x: x.isna().sum()).drop(columns=['ID', 'Date', 'JD', 'Year'])
# Reset the index for the new dataframe
nan_count_df = nan_count_df.reset_index()
# Sort columns by their names (ID_WMO values)
sorted_columns = sorted(nan_count_df.columns[1:], key=lambda x: int(x)) # Convert ID_WMO to integers for sorting
nan_count_df = nan_count_df[['Year'] + sorted_columns]
# Display the result
print(nan_count_df)
# Save the summarized dataframe to CSV
nan_count_df.to_csv('./output/idn_cli_annual_nan_precip_count_summary_bmkg.csv', index=False)
    Year  96001  96009  96011  96013  96015  96017  96031  96033  96037  ...  \
0   2000    214      0      0    214    214      0      0    214      0  ...
1   2001    365      0      0    365    365      0      0    365      0  ...
2   2002    365      0      0    365    365      0      0    365      0  ...
3   2003    365      0      0    365    365      0      0    365      0  ...
4   2004    366      0      0    366    366      0     31    366      0  ...
5   2005    365      0      0    365    365      0      0    365      0  ...
6   2006    365      0      0    365    365      0      0    365      0  ...
7   2007    365      0      0    365    365      0      0    365      0  ...
8   2008    366      0      0    366    366      0      0    366      0  ...
9   2009    365      0      0    365    365      0      0    365      0  ...
10  2010      0      0      0    365      0      0     12      0      0  ...
11  2011      0      0      0    365      0      0      0      0      0  ...
12  2012      1    335      6    366      0      1     95     11      2  ...
13  2013      0    160    180    365      0      0      0     24     84  ...
14  2014    187    199      1    341      2      0      0      0      1  ...
15  2015    188     46    235     30     86    121    222    203    233  ...
16  2016     54      8    200     58     16    118    117     47    173  ...
17  2017    171      3    226     53     97    133    106     69    138  ...
18  2018    202     33    224     52    117    130    147     96    135  ...
19  2019     35     57    122    200     98     70     66    115    109  ...
20  2020     51     68    145    366     93     90    118     48    120  ...
21  2021     61      3    130    365     71     67    154      4    117  ...

    97760  97780  97790  97796  97810  97812  97876  97900  97978  97980
0     214      0      0    214    214    214    214      0    214    214
1     365     61      0      1    365    365      0      0    365      0
2     365      0     31      1    365    365    123      0    365      0
3     365    272     31      0    365    365      1      0    365      0
4     366    122      0      0    366    366      0      0    366      0
5     365    365      0      0    365    365      0      0    365      0
6     365      0      0      0    365    365      0      0    365      0
7     365     92      0      0    365    365     31      0    365      0
8     366    275      0      0    366    366     29      0    366      0
9     365     48      0      1    365    365     37      0    365     91
10    365    132      0      0      0    365    120      0    365     30
11    365     79      0      0      0    365     61      0    365      0
12    366    184      0      0      5    366     31      0    366     31
13    364    149    322     46    182    311     96     73    365      0
14    235      0    335     40     62     72    245     32    365      0
15    181    274    250     58    186    159    364    177    365    349
16    129    232    110     48      8    366    127     91    366    196
17    141    161    102     27     51    365    164     89    365     29
18    163      4    123     35     38    365    193    180    365     85
19    130    140    131     52      5    365    201    365    227     91
20     82    115     92     28     57    366    245     43    242     46
21     23     34     14     14      5    365     38     16    155     28

[22 rows x 181 columns]
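To turn these counts into a keep-or-drop decision, one simple rule is to count, per station, the years with more than some threshold of missing days; a minimal sketch, with a 180-day threshold chosen purely for illustration:

import pandas as pd

nan_count_df = pd.read_csv('./output/idn_cli_annual_nan_precip_count_summary_bmkg.csv')

# Per station, count years with more than 180 NaN days (threshold is illustrative)
bad_years = (nan_count_df.drop(columns='Year') > 180).sum()
print(bad_years.sort_values(ascending=False).head(10))  # ten worst stations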
Let's visualise it as a heat map.
Each column represents a year, each row represents an ID_WMO, and the colour intensity indicates the count of NaN values.
This will let you quickly spot patterns across years and stations.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Read the csv file
df = pd.read_csv('./output/idn_cli_annual_nan_precip_count_summary_bmkg.csv')
# Set the 'Year' column as the index
df.set_index('Year', inplace=True)
# Check and drop the 'ID' column if it exists
if 'ID' in df.columns:
df.drop('ID', axis=1, inplace=True)
# Transpose the DataFrame
df = df.transpose()
# Sort the DataFrame by its index (ID_WMO)
df = df.sort_index()
# Generate the heatmap using only matplotlib
fig, ax = plt.subplots(figsize=(20, 40))
# Plot heatmap using pcolormesh
cax = ax.pcolormesh(df, cmap="viridis", shading="auto")
# Create a colorbar and adjust its height
cbar = fig.colorbar(cax, ax=ax, aspect=50)
cbar.ax.tick_params(labelsize=14) # Font size for colorbar tick labels
cbar.set_label('Number of NaN Days', size=16) # Font size for colorbar label
# Set x-axis labels to be the columns of df (i.e., years)
ax.set_xticks(np.arange(len(df.columns)) + 0.5)
ax.set_xticklabels(df.columns, rotation=45, ha='right', fontsize=14) # Font size for x-axis labels
# Set y-axis labels to be the index of df (i.e., the ID_WMO values)
ax.set_yticks(np.arange(len(df.index)) + 0.5)
ax.set_yticklabels(df.index, fontsize=14) # Font size for y-axis labels
# Set title and x/y axis labels with specified font sizes
ax.set_title("Number of Days with NaN Values per Year", fontsize=20)
ax.set_xlabel("Year", fontsize=16)
ax.set_ylabel("ID_WMO", fontsize=16)
# Apply tight_layout before saving so the saved PNG matches the displayed figure
plt.tight_layout()
# Save the figure as a PNG
plt.savefig('./output/idn_cli_weatherstation_heatmap_nancount_bmkg.png', dpi=300)
plt.show()
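For comparison, if seaborn happens to be available, the same figure takes only a couple of calls; a minimal sketch, assuming the transposed df from above:

import seaborn as sns
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(20, 40))
# seaborn handles the mesh, colorbar, and tick labels in one call
sns.heatmap(df, cmap='viridis', cbar_kws={'label': 'Number of NaN Days'}, ax=ax)
plt.tight_layout()
plt.show()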