#!/usr/bin/env python
# coding: utf-8

# # Parsing daily climate data in a xlsx, downloaded from BMKG's Data Online
#
# If you have a spreadsheet file of daily climate data downloaded from
# [BMKG Data Online](https://dataonline.bmkg.go.id/data_iklim), you need to
# parse it before the data is ready to use.
#
# The file's structure is a bit unconventional for a typical Excel sheet:
# the values for ID WMO, Nama Stasiun, Lintang, etc. are embedded in the
# header names themselves, not in the cell values under those headers.
# The climate data itself starts at row 9 for the header.
#
# ## Extract weather station information from data collection
#
# There are ~180 xlsx files (1 xlsx per station) in a folder. Extract the
# station information from each file, compile it, and save it as a single
# csv file.

import os
import re

import pandas as pd

try:
    from tqdm import tqdm
except ImportError:  # keep the module importable when tqdm is not installed
    def tqdm(iterable, **kwargs):
        return iterable


def get_data_after_colon(cell_value):
    """Return the text following the first ':' in *cell_value*, stripped.

    BMKG metadata cells look like "Lintang : -6.15"; the useful value sits
    after the colon. When no colon is present, the whole (stripped) string
    is returned unchanged.
    """
    # \s* rather than \s+ so "Lintang:-6.15" (no space after the colon)
    # still parses instead of falling through to the raw string.
    match = re.search(r':\s*(.*)', str(cell_value))
    if match:
        return match.group(1).strip()
    return str(cell_value).strip()


def get_only_numbers(cell_value):
    """Return every digit character of *cell_value* concatenated (e.g. the WMO id)."""
    return ''.join(re.findall(r'\d+', str(cell_value)))


def extract_station_info(folder_path):
    """Collect one [ID_WMO, Station, Lon, Lat, Elevation] row per xlsx in *folder_path*.

    Files that fail to parse are reported and skipped (best-effort batch run).
    """
    xlsx_files = [f for f in os.listdir(folder_path) if f.endswith('.xlsx')]
    all_data = []
    for file in tqdm(xlsx_files, desc="Processing files"):
        try:
            xls = pd.ExcelFile(os.path.join(folder_path, file))
            sheet = xls.parse(xls.sheet_names[0])
            # Station metadata lives in the header and first rows of column C.
            id_wmo = get_only_numbers(sheet.columns[2])       # from the column header
            station = get_data_after_colon(sheet.iloc[0, 2])
            lat = float(get_data_after_colon(sheet.iloc[1, 2]))
            lon = float(get_data_after_colon(sheet.iloc[2, 2]))
            elevation = float(get_data_after_colon(sheet.iloc[3, 2]))
            all_data.append([id_wmo, station, lon, lat, elevation])
        except Exception as e:
            # Report and continue with the remaining files.
            print(f"Error processing file {file}: {e}")
    return all_data


if __name__ == "__main__":
    folder_path = './input/BMKG/'

    df = pd.DataFrame(extract_station_info(folder_path),
                      columns=['ID_WMO', 'Station', 'Lon', 'Lat', 'Elevation'])
    df = df.sort_values(by=['ID_WMO']).reset_index(drop=True)
    df['ID'] = df.index + 1  # 1-based running id, assigned after sorting by ID_WMO
    df = df[['ID', 'ID_WMO', 'Station', 'Lon', 'Lat', 'Elevation']]
    df.to_csv('./output/idn_cli_weatherstation_location_bmkg.csv', index=False)
    print(df)  # preview the output

    # ## Plot the station
    #
    # Plot the station coordinates on a map to see the spatial distribution
    # across the country.
    import matplotlib.pyplot as plt
    from mpl_toolkits.basemap import Basemap

    df = pd.read_csv('./output/idn_cli_weatherstation_location_bmkg.csv')
    fig, ax = plt.subplots(figsize=(10, 15))
    m = Basemap(resolution='i', projection='merc',
                llcrnrlat=-11, urcrnrlat=6,
                llcrnrlon=95, urcrnrlon=141, ax=ax)
    m.drawcoastlines()
    m.drawcountries()
    m.drawmapboundary(fill_color='white')               # sea: white
    m.fillcontinents(color='grey', lake_color='white')  # land: grey

    # Plot each weather station as a red dot.
    for _, row in df.iterrows():
        x, y = m(row['Lon'], row['Lat'])
        m.plot(x, y, 'ro', markersize=5)
        # plt.text(x, y, row['ID_WMO'], fontsize=9)

    plt.savefig('./output/idn_cli_weatherstation_map_bmkg.png', dpi=300)
    plt.show()

    # ## Extract climate variables
    #
    # Next, extract climate variables (here: RR, rainfall) from each xlsx and
    # compile them into a single csv, start date 1 Jun 2000, in long format
    # with columns: ID, Date, Julian Date, ID_WMO(1), ID_WMO(2), ..., ID_WMO(n).
# I'd like to pivot the data so that each unique ID_WMO value forms a column
# in the resultant DataFrame. The RR values will populate these columns.

import os

import pandas as pd

try:
    from tqdm import tqdm
except ImportError:  # keep the module importable when tqdm is not installed
    def tqdm(iterable, **kwargs):
        return iterable


def to_julian_date(date_string):
    """Convert a DD-MM-YYYY date string to its Julian day number (1-366).

    Returns None when the string cannot be parsed as DD-MM-YYYY.
    """
    try:
        date_obj = pd.to_datetime(date_string, format='%d-%m-%Y')
        return int(date_obj.strftime('%j'))
    except Exception:
        return None


def extract_rainfall(folder_path, start='01-06-2000'):
    """Compile daily RR (rainfall) from every xlsx in *folder_path*.

    Returns a long-format dict with keys "ID", "Date", "JD" plus one key per
    station ID_WMO whose list holds that station's RR value for each date.
    Only dates on/after *start* (DD-MM-YYYY) are kept; files that fail to
    parse are reported and skipped.
    """
    xlsx_files = [f for f in os.listdir(folder_path) if f.endswith('.xlsx')]
    data_dict = {"ID": [], "Date": [], "JD": []}
    # date -> row position: O(1) lookup instead of list.index() per row
    # (the original scanned data_dict["Date"] twice per row -> O(n^2)).
    date_to_idx = {}
    # Hoisted out of the loop: the cut-off date never changes.
    start_date = pd.to_datetime(start, format='%d-%m-%Y')

    for file in tqdm(xlsx_files, desc="Processing files"):
        try:
            xls = pd.ExcelFile(os.path.join(folder_path, file))
            # The data table's header is on row 9, so skip the first 8 rows.
            sheet = xls.parse(xls.sheet_names[0], skiprows=8)
            # The WMO id is embedded in the third header cell of the raw sheet;
            # nrows=0 reads only the header row instead of re-parsing all data.
            raw_header = xls.parse(xls.sheet_names[0], nrows=0).columns[2]
            ID_WMO = ''.join(filter(str.isdigit, str(raw_header)))

            if ID_WMO not in data_dict:
                # Backfill None for dates collected before this station appeared.
                data_dict[ID_WMO] = [None] * len(data_dict['ID'])

            for index, row in sheet.iterrows():
                date_string = row['Tanggal']
                # A blank date marks the end of the data table for this sheet.
                if pd.isna(date_string):
                    break
                if pd.to_datetime(date_string, format='%d-%m-%Y') >= start_date:
                    if date_string not in date_to_idx:
                        # First time this date is seen: append a fresh row and
                        # pad every station column with None.
                        date_to_idx[date_string] = len(data_dict["Date"])
                        data_dict["ID"].append(index + 1)
                        data_dict["Date"].append(date_string)
                        data_dict["JD"].append(to_julian_date(date_string))
                        for key in data_dict:
                            if key not in ("ID", "Date", "JD"):
                                data_dict[key].append(None)
                    # Fill this station's RR value for the current date.
                    data_dict[ID_WMO][date_to_idx[date_string]] = row['RR']
        except Exception as e:
            print(f"Error processing file {file}: {e}")
    return data_dict


if __name__ == "__main__":
    folder_path = './input/BMKG/'

    df = pd.DataFrame(extract_rainfall(folder_path))
    df.to_csv('./output/idn_cli_weatherstation_data_bmkg.csv', index=False)
    print(df)  # preview the output

    # ## Annual summary of the data availability
    #
    # Count the number of NaN days per year per station, to judge whether a
    # station's record is good enough to keep.
    df = pd.read_csv('./output/idn_cli_weatherstation_data_bmkg.csv')
    df['Year'] = pd.to_datetime(df['Date'], format='%d-%m-%Y').dt.year
    nan_count_df = (df.groupby('Year')
                      .apply(lambda x: x.isna().sum())
                      .drop(columns=['ID', 'Date', 'JD', 'Year'])
                      .reset_index())
    # Sort station columns numerically by their ID_WMO value.
    sorted_columns = sorted(nan_count_df.columns[1:], key=lambda x: int(x))
    nan_count_df = nan_count_df[['Year'] + sorted_columns]
    print(nan_count_df)
    nan_count_df.to_csv('./output/idn_cli_annual_nan_precip_count_summary_bmkg.csv',
                        index=False)

    # ## Heat map
    #
    # Visualise the summary as a heat map: each column represents a year,
    # each row an ID_WMO, colour intensity the count of NaN values.
# This will let you quickly spot patterns across years and stations.

import pandas as pd

if __name__ == "__main__":
    # Plotting-only imports kept under the guard so importing this module
    # does not require matplotlib.
    import matplotlib.pyplot as plt
    import numpy as np

    # Load the annual NaN-count summary produced by the previous step.
    df = pd.read_csv('./output/idn_cli_annual_nan_precip_count_summary_bmkg.csv')
    df.set_index('Year', inplace=True)

    # Drop the 'ID' column if it leaked into the summary file.
    if 'ID' in df.columns:
        df.drop('ID', axis=1, inplace=True)

    # Transpose so rows = ID_WMO, columns = Year, then sort by ID_WMO.
    df = df.transpose().sort_index()

    fig, ax = plt.subplots(figsize=(20, 40))

    # Heatmap using plain matplotlib.
    cax = ax.pcolormesh(df, cmap="viridis", shading="auto")

    cbar = fig.colorbar(cax, ax=ax, aspect=50)
    cbar.ax.tick_params(labelsize=14)
    cbar.set_label('Number of NaN Days', size=16)

    # +0.5 centres each tick label on its cell.
    ax.set_xticks(np.arange(len(df.columns)) + 0.5)
    ax.set_xticklabels(df.columns, rotation=45, ha='right', fontsize=14)
    ax.set_yticks(np.arange(len(df.index)) + 0.5)
    ax.set_yticklabels(df.index, fontsize=14)

    ax.set_title("Number of Days with NaN Values per Year", fontsize=20)
    ax.set_xlabel("Year", fontsize=16)
    ax.set_ylabel("ID_WMO", fontsize=16)

    # BUG FIX: tight_layout must run BEFORE savefig — the original called it
    # after saving, so the exported PNG never got the tightened layout.
    plt.tight_layout()
    plt.savefig('./output/idn_cli_weatherstation_heatmap_nancount_bmkg.png', dpi=300)
    plt.show()