#!/usr/bin/env python
# coding: utf-8

# # Parsing daily climate data in a xlsx, downloaded from BMKG's Data Online
#
# If you have a spreadsheet file of daily climate data downloaded from
# [BMKG Data Online](https://dataonline.bmkg.go.id/data_iklim), you need to
# parse it before the data is ready to use.
#
# The file's structure is a bit unconventional for a typical Excel sheet:
# the values for ID WMO, Nama Stasiun, Lintang, etc. are embedded in the
# header names themselves, not in the cell values under those headers.
# The climate data itself starts at row 9 for the header.
#
# ## Extract weather station information from data collection
#
# There are ~180 xlsx files (1 xlsx per station) in a folder. Extract the
# station information from each file, compile it, and save it as a single
# csv file.

import os
import re

import pandas as pd

try:
    from tqdm import tqdm
except ImportError:  # keep the module importable when tqdm is not installed
    def tqdm(iterable, **kwargs):
        return iterable


def get_data_after_colon(cell_value):
    """Return the text following the first ':' in *cell_value*, stripped.

    BMKG metadata cells look like "Lintang : -6.15"; the useful value sits
    after the colon. When no colon is present, the whole (stripped) string
    is returned unchanged.
    """
    # \s* rather than \s+ so "Lintang:-6.15" (no space after the colon)
    # still parses instead of falling through to the raw string.
    match = re.search(r':\s*(.*)', str(cell_value))
    if match:
        return match.group(1).strip()
    return str(cell_value).strip()


def get_only_numbers(cell_value):
    """Return every digit character of *cell_value* concatenated (e.g. the WMO id)."""
    return ''.join(re.findall(r'\d+', str(cell_value)))


def extract_station_info(folder_path):
    """Collect one [ID_WMO, Station, Lon, Lat, Elevation] row per xlsx in *folder_path*.

    Files that fail to parse are reported and skipped (best-effort batch run).
    """
    xlsx_files = [f for f in os.listdir(folder_path) if f.endswith('.xlsx')]
    all_data = []
    for file in tqdm(xlsx_files, desc="Processing files"):
        try:
            xls = pd.ExcelFile(os.path.join(folder_path, file))
            sheet = xls.parse(xls.sheet_names[0])
            # Station metadata lives in the header and first rows of column C.
            id_wmo = get_only_numbers(sheet.columns[2])       # from the column header
            station = get_data_after_colon(sheet.iloc[0, 2])
            lat = float(get_data_after_colon(sheet.iloc[1, 2]))
            lon = float(get_data_after_colon(sheet.iloc[2, 2]))
            elevation = float(get_data_after_colon(sheet.iloc[3, 2]))
            all_data.append([id_wmo, station, lon, lat, elevation])
        except Exception as e:
            # Report and continue with the remaining files.
            print(f"Error processing file {file}: {e}")
    return all_data


if __name__ == "__main__":
    folder_path = './input/BMKG/'

    df = pd.DataFrame(extract_station_info(folder_path),
                      columns=['ID_WMO', 'Station', 'Lon', 'Lat', 'Elevation'])
    df = df.sort_values(by=['ID_WMO']).reset_index(drop=True)
    df['ID'] = df.index + 1  # 1-based running id, assigned after sorting by ID_WMO
    df = df[['ID', 'ID_WMO', 'Station', 'Lon', 'Lat', 'Elevation']]
    df.to_csv('./output/idn_cli_weatherstation_location_bmkg.csv', index=False)
    print(df)  # preview the output

    # ## Plot the station
    #
    # Plot the station coordinates on a map to see the spatial distribution
    # across the country.
    import matplotlib.pyplot as plt
    from mpl_toolkits.basemap import Basemap

    df = pd.read_csv('./output/idn_cli_weatherstation_location_bmkg.csv')
    fig, ax = plt.subplots(figsize=(10, 15))
    m = Basemap(resolution='i', projection='merc',
                llcrnrlat=-11, urcrnrlat=6,
                llcrnrlon=95, urcrnrlon=141, ax=ax)
    m.drawcoastlines()
    m.drawcountries()
    m.drawmapboundary(fill_color='white')               # sea: white
    m.fillcontinents(color='grey', lake_color='white')  # land: grey

    # Plot each weather station as a red dot.
    for _, row in df.iterrows():
        x, y = m(row['Lon'], row['Lat'])
        m.plot(x, y, 'ro', markersize=5)
        # plt.text(x, y, row['ID_WMO'], fontsize=9)

    plt.savefig('./output/idn_cli_weatherstation_map_bmkg.png', dpi=300)
    plt.show()

    # ## Extract climate variables
    #
    # Next, extract climate variables (here: RR, rainfall) from each xlsx and
    # compile them into a single csv, start date 1 Jun 2000, in long format
    # with columns: ID, Date, Julian Date, ID_WMO(1), ID_WMO(2), ..., ID_WMO(n).
# I'd like to pivot the data so that each unique ID_WMO value forms a column
# in the resultant DataFrame. The RR values will populate these columns.

import os

import pandas as pd

try:
    from tqdm import tqdm
except ImportError:  # keep the module importable when tqdm is not installed
    def tqdm(iterable, **kwargs):
        return iterable


def to_julian_date(date_string):
    """Convert a DD-MM-YYYY date string to its Julian day number (1-366).

    Returns None when the string cannot be parsed as DD-MM-YYYY.
    """
    try:
        date_obj = pd.to_datetime(date_string, format='%d-%m-%Y')
        return int(date_obj.strftime('%j'))
    except Exception:
        return None


def extract_rainfall(folder_path, start='01-06-2000'):
    """Compile daily RR (rainfall) from every xlsx in *folder_path*.

    Returns a long-format dict with keys "ID", "Date", "JD" plus one key per
    station ID_WMO whose list holds that station's RR value for each date.
    Only dates on/after *start* (DD-MM-YYYY) are kept; files that fail to
    parse are reported and skipped.
    """
    xlsx_files = [f for f in os.listdir(folder_path) if f.endswith('.xlsx')]
    data_dict = {"ID": [], "Date": [], "JD": []}
    # date -> row position: O(1) lookup instead of list.index() per row
    # (the original scanned data_dict["Date"] twice per row -> O(n^2)).
    date_to_idx = {}
    # Hoisted out of the loop: the cut-off date never changes.
    start_date = pd.to_datetime(start, format='%d-%m-%Y')

    for file in tqdm(xlsx_files, desc="Processing files"):
        try:
            xls = pd.ExcelFile(os.path.join(folder_path, file))
            # The data table's header is on row 9, so skip the first 8 rows.
            sheet = xls.parse(xls.sheet_names[0], skiprows=8)
            # The WMO id is embedded in the third header cell of the raw sheet;
            # nrows=0 reads only the header row instead of re-parsing all data.
            raw_header = xls.parse(xls.sheet_names[0], nrows=0).columns[2]
            ID_WMO = ''.join(filter(str.isdigit, str(raw_header)))

            if ID_WMO not in data_dict:
                # Backfill None for dates collected before this station appeared.
                data_dict[ID_WMO] = [None] * len(data_dict['ID'])

            for index, row in sheet.iterrows():
                date_string = row['Tanggal']
                # A blank date marks the end of the data table for this sheet.
                if pd.isna(date_string):
                    break
                if pd.to_datetime(date_string, format='%d-%m-%Y') >= start_date:
                    if date_string not in date_to_idx:
                        # First time this date is seen: append a fresh row and
                        # pad every station column with None.
                        date_to_idx[date_string] = len(data_dict["Date"])
                        data_dict["ID"].append(index + 1)
                        data_dict["Date"].append(date_string)
                        data_dict["JD"].append(to_julian_date(date_string))
                        for key in data_dict:
                            if key not in ("ID", "Date", "JD"):
                                data_dict[key].append(None)
                    # Fill this station's RR value for the current date.
                    data_dict[ID_WMO][date_to_idx[date_string]] = row['RR']
        except Exception as e:
            print(f"Error processing file {file}: {e}")
    return data_dict


if __name__ == "__main__":
    folder_path = './input/BMKG/'

    df = pd.DataFrame(extract_rainfall(folder_path))
    df.to_csv('./output/idn_cli_weatherstation_data_bmkg.csv', index=False)
    print(df)  # preview the output

    # ## Annual summary of the data availability
    #
    # Count the number of NaN days per year per station, to judge whether a
    # station's record is good enough to keep.
    df = pd.read_csv('./output/idn_cli_weatherstation_data_bmkg.csv')
    df['Year'] = pd.to_datetime(df['Date'], format='%d-%m-%Y').dt.year
    nan_count_df = (df.groupby('Year')
                      .apply(lambda x: x.isna().sum())
                      .drop(columns=['ID', 'Date', 'JD', 'Year'])
                      .reset_index())
    # Sort station columns numerically by their ID_WMO value.
    sorted_columns = sorted(nan_count_df.columns[1:], key=lambda x: int(x))
    nan_count_df = nan_count_df[['Year'] + sorted_columns]
    print(nan_count_df)
    nan_count_df.to_csv('./output/idn_cli_annual_nan_precip_count_summary_bmkg.csv',
                        index=False)

    # ## Heat map
    #
    # Visualise the summary as a heat map: each column represents a year,
    # each row an ID_WMO, colour intensity the count of NaN values.
# This will let you quickly spot patterns across years and stations.

import pandas as pd

if __name__ == "__main__":
    # Plotting-only imports kept under the guard so importing this module
    # does not require matplotlib.
    import matplotlib.pyplot as plt
    import numpy as np

    # Load the annual NaN-count summary produced by the previous step.
    df = pd.read_csv('./output/idn_cli_annual_nan_precip_count_summary_bmkg.csv')
    df.set_index('Year', inplace=True)

    # Drop the 'ID' column if it leaked into the summary file.
    if 'ID' in df.columns:
        df.drop('ID', axis=1, inplace=True)

    # Transpose so rows = ID_WMO, columns = Year, then sort by ID_WMO.
    df = df.transpose().sort_index()

    fig, ax = plt.subplots(figsize=(20, 40))

    # Heatmap using plain matplotlib.
    cax = ax.pcolormesh(df, cmap="viridis", shading="auto")

    cbar = fig.colorbar(cax, ax=ax, aspect=50)
    cbar.ax.tick_params(labelsize=14)
    cbar.set_label('Number of NaN Days', size=16)

    # +0.5 centres each tick label on its cell.
    ax.set_xticks(np.arange(len(df.columns)) + 0.5)
    ax.set_xticklabels(df.columns, rotation=45, ha='right', fontsize=14)
    ax.set_yticks(np.arange(len(df.index)) + 0.5)
    ax.set_yticklabels(df.index, fontsize=14)

    ax.set_title("Number of Days with NaN Values per Year", fontsize=20)
    ax.set_xlabel("Year", fontsize=16)
    ax.set_ylabel("ID_WMO", fontsize=16)

    # BUG FIX: tight_layout must run BEFORE savefig — the original called it
    # after saving, so the exported PNG never got the tightened layout.
    plt.tight_layout()
    plt.savefig('./output/idn_cli_weatherstation_heatmap_nancount_bmkg.png', dpi=300)
    plt.show()