Berdasarkan isu #88: request: ambil dataset hujan harian
Referensi isu:
hidrokit.contrib.taruma.hk79
#79. (lihat notebook/manual). request: ambil dataset hujan jam-jaman dari excelDeskripsi permasalahan:
pandas.DataFrame
, dengan baris menunjukkan observasi/kejadian dan kolom menunjukkan stasiun.Change Log:
read_workbook(...)
untuk membaca berkas tanpa perlu mengetahui nama sheet dan membaca seluruh sheet kecuali yang sheet berawalan ignore_str='_'
.as_df=False
akan memperoleh pesan error karena luaran sebelum versi 0.4.0 berupa list
, sedangkan untuk versi 0.4.0 luaran berupa dictionary
.try:
import hidrokit
except ModuleNotFoundError:
!pip install hidrokit -q
import hidrokit
print(f'hidrokit version: {hidrokit.__version__}')
hidrokit version: 0.4.1
# Unduh dataset
!wget -O sample.xlsx "https://taruma.github.io/assets/hidrokit_dataset/hidrokit_daily_template.xlsx" -q
FILE = 'sample.xlsx'
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
import functools
def deprecated(new_func_name):
"""
Decorator to mark a function as deprecated.
Parameters:
- new_func_name (str): The name of the new function that should be used instead.
Returns:
- wrapper (function): The decorated function.
Example:
@deprecated("new_function")
def old_function():
pass
The above example will generate a warning when `old_function` is called,
suggesting to use `new_function` instead.
"""
def decorator(func):
@functools.wraps(func)
def wrapper(*args, **kwargs):
warnings.warn(
f"{func.__name__} is deprecated, use {new_func_name} instead",
DeprecationWarning,
)
return func(*args, **kwargs)
return wrapper
return decorator
from calendar import isleap
from typing import List, Union
import pandas as pd
import numpy as np
def _melt_to_year_vector(dataframe: pd.DataFrame, year: int) -> np.ndarray:
"""
Melt a pandas DataFrame to a 1D numpy array for a specific year.
Args:
dataframe: The pandas DataFrame to melt.
year: The year to consider when melting the DataFrame.
Returns:
A 1D numpy array representing the melted DataFrame for the specified year.
"""
if not isinstance(dataframe, pd.DataFrame):
raise ValueError("dataframe must be a pandas DataFrame.")
if not isinstance(year, int):
raise ValueError("year must be an integer.")
_drop = [59, 60, 61, 123, 185, 278, 340]
_drop_leap = [60, 61, 123, 185, 278, 340]
melted_data = dataframe.melt().drop("variable", axis=1)
if isleap(year):
return melted_data["value"].drop(_drop_leap).values
return melted_data["value"].drop(_drop).values
def _generate_date_range_for_year(year):
"""Return DateTimeIndex object for one year.
Args:
year (int): The year for which to generate the date range.
Returns:
pd.DatetimeIndex: A DateTimeIndex object representing the date range for the specified year.
"""
start_date = pd.Timestamp(year, 1, 1)
end_date = pd.Timestamp(year + 1, 1, 1)
return pd.date_range(start_date, end_date, inclusive="left")
def _create_yearly_dataframe(
dataframe: pd.DataFrame, year: int, station_name: str
) -> pd.DataFrame:
"""
Create a DataFrame for one year.
Parameters:
df (DataFrame): The original DataFrame.
year (int): The year to filter the DataFrame.
station_name (str): The name of the station.
Returns:
DataFrame: A new DataFrame with data for the specified year and station.
"""
return pd.DataFrame(
data=_melt_to_year_vector(dataframe, year),
index=_generate_date_range_for_year(year),
columns=[station_name],
)
def _extract_data_from_sheet(
dataframe: pd.DataFrame, station: str, return_as_dataframe: bool = True
) -> Union[pd.DataFrame, List[pd.DataFrame]]:
"""
Extracts data from a sheet in the given dataframe and returns it
as a dataframe or a list of dataframes.
Parameters:
dataframe (pd.DataFrame): The dataframe containing the sheet data.
station (str): The name of the station.
return_as_dataframe (bool, optional): Whether to return the data
as a single dataframe or a list of dataframes. Defaults to True.
Returns:
Union[pd.DataFrame, List[pd.DataFrame]]:
The extracted data as a dataframe or a list of dataframes.
"""
total_years = int(dataframe.iloc[0, 1])
yearly_dataframes = []
for i in range(2, total_years * 33, 33):
current_year = int(dataframe.iloc[i, 1])
yearly_data = dataframe.iloc[i : i + 31, 4:16]
yearly_dataframe = _create_yearly_dataframe(yearly_data, current_year, station)
yearly_dataframes.append(yearly_dataframe)
if return_as_dataframe:
return pd.concat(yearly_dataframes, sort=True)
return yearly_dataframes
def read_hidrokit_workbook(
file_path, station_names=None, ignore_prefix="_", return_as_dataframe=True
):
"""
Read data from an hidrokit Excel workbook.
Parameters:
file_path (str): The path to the Excel workbook file.
station_names (list or str, optional): The names of the sheets to read.
If not provided, all sheets will be read.
ignore_prefix (str, optional): The prefix used to ignore sheets. Default is '_'.
return_as_dataframe (bool, optional): Whether to return the data as a pandas DataFrame.
Default is True.
Returns:
dict or pandas.DataFrame: A dictionary containing the data from each sheet,
with sheet names as keys. If `return_as_dataframe` is True,
a pandas DataFrame is returned instead.
"""
excel_file = pd.ExcelFile(file_path)
station_data = {}
all_sheet_names = excel_file.sheet_names
if station_names is None:
station_names = []
for sheet_name in all_sheet_names:
if not sheet_name.startswith(ignore_prefix):
station_names.append(sheet_name)
else:
station_names = (
[station_names] if isinstance(station_names, str) else station_names
)
for station_name in station_names:
dataframe = excel_file.parse(sheet_name=station_name, header=None)
station_data[station_name] = _extract_data_from_sheet(dataframe, station_name)
if return_as_dataframe:
return pd.concat(station_data.values(), sort=True, axis=1)
return station_data
## Backward Compatibility (0.3.x - 0.4.x)
@deprecated("_melt_to_year_vector")
def _melt_to_array(*args, **kwargs):
return _melt_to_year_vector(*args, **kwargs)
@deprecated("_generate_date_range_for_year")
def _index_daily(*args, **kwargs):
return _generate_date_range_for_year(*args, **kwargs)
@deprecated("_create_yearly_dataframe")
def _yearly_df(*args, **kwargs):
return _create_yearly_dataframe(*args, **kwargs)
@deprecated("_extract_data_from_sheet")
def _data_from_sheet(*args, **kwargs):
return _extract_data_from_sheet(*args, **kwargs)
@deprecated("read_hidrokit_workbook")
def read_workbook(io, stations=None, ignore_str="_", as_df=True):
"""
Read data from an hidrokit Excel workbook.
Parameters:
io (str): The path to the Excel workbook file.
stations (list or str, optional): The names of the sheets to read.
If not provided, all sheets will be read.
ignore_str (str, optional): The prefix used to ignore sheets. Default is '_'.
as_df (bool, optional): Whether to return the data as a pandas DataFrame.
Default is True.
Returns:
dict or pandas.DataFrame: A dictionary containing the data from each sheet,
with sheet names as keys. If `as_df` is True,
a pandas DataFrame is returned instead.
"""
return read_hidrokit_workbook(
file_path=io,
station_names=stations,
ignore_prefix=ignore_str,
return_as_dataframe=as_df,
)
np.float = float # temporary fix 0.4.x
from hidrokit.contrib.taruma import hk79
import numpy as np
# Ambil informasi excel menggunakan modul .hk79
data_info = hk79._get_info(FILE, config_sheet='_INFO')
print(':: INFORMASI PADA BERKAS')
print(data_info)
:: INFORMASI PADA BERKAS {'key': 'VALUE', 'n_stations': 2, 'stations': 'AURENE, TYBALT', 'source': 'RATA SUM', 'station_1_years': '2002, 2003, 2004', 'station_2_years': '2007, 2008'}
stations = data_info['stations'].replace(' ', '').split(',')
print('nama stasiun dalam berkas:', stations)
nama stasiun dalam berkas: ['AURENE', 'TYBALT']
aurene = read_workbook(FILE, ['AURENE'])
aurene.info()
aurene.head()
<ipython-input-16-23e254b88fe0>:27: DeprecationWarning: read_workbook is deprecated, use read_hidrokit_workbook instead warnings.warn(
<class 'pandas.core.frame.DataFrame'> DatetimeIndex: 1096 entries, 2002-01-01 to 2004-12-31 Freq: D Data columns (total 1 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 AURENE 1096 non-null object dtypes: object(1) memory usage: 17.1+ KB
AURENE | |
---|---|
2002-01-01 | 17.08 |
2002-01-02 | 16.28 |
2002-01-03 | 20.32 |
2002-01-04 | 18.34 |
2002-01-05 | 13.16 |
dataset = read_workbook(FILE, ['TYBALT', 'AURENE'])
dataset.sort_index()
<ipython-input-16-23e254b88fe0>:27: DeprecationWarning: read_workbook is deprecated, use read_hidrokit_workbook instead warnings.warn(
TYBALT | AURENE | |
---|---|---|
2002-01-01 | NaN | 17.08 |
2002-01-02 | NaN | 16.28 |
2002-01-03 | NaN | 20.32 |
2002-01-04 | NaN | 18.34 |
2002-01-05 | NaN | 13.16 |
... | ... | ... |
2008-12-27 | 134.83 | NaN |
2008-12-28 | 81.88 | NaN |
2008-12-29 | 20.14 | NaN |
2008-12-30 | 208.54 | NaN |
2008-12-31 | 208.14 | NaN |
1827 rows × 2 columns
read_hidrokit_workbook(io, ...)
¶Function: read_hidrokit_workbook(file_path, station_names=None, ignore_prefix="_", return_as_dataframe=True)
Read data from an Excel workbook.
Parameters:
file_path (str): The path to the Excel workbook file.
station_names (list or str, optional): The names of the sheets to read.
If not provided, all sheets will be read.
ignore_prefix (str, optional): The prefix used to ignore sheets. Default is '_'.
return_as_dataframe (bool, optional): Whether to return the data as a pandas DataFrame.
Default is True.
Returns:
dict or pandas.DataFrame: A dictionary containing the data from each sheet,
with sheet names as keys. If `return_as_dataframe` is True,
a pandas DataFrame is returned instead.
Catatan: Di hidrokit 0.5.x, fungsi read_workbook(...)
diperbarui menjadi read_hidrokit_workbook(...)
agar lebih jelas dan mudah dibaca.
read_hidrokit_workbook(FILE)
AURENE | TYBALT | |
---|---|---|
2002-01-01 | 17.08 | NaN |
2002-01-02 | 16.28 | NaN |
2002-01-03 | 20.32 | NaN |
2002-01-04 | 18.34 | NaN |
2002-01-05 | 13.16 | NaN |
... | ... | ... |
2008-12-27 | NaN | 134.83 |
2008-12-28 | NaN | 81.88 |
2008-12-29 | NaN | 20.14 |
2008-12-30 | NaN | 208.54 |
2008-12-31 | NaN | 208.14 |
1827 rows × 2 columns
read_hidrokit_workbook(FILE, return_as_dataframe=False)
{'AURENE': AURENE 2002-01-01 17.08 2002-01-02 16.28 2002-01-03 20.32 2002-01-04 18.34 2002-01-05 13.16 ... ... 2004-12-27 454.75 2004-12-28 69.75 2004-12-29 189.39 2004-12-30 166.39 2004-12-31 133.99 [1096 rows x 1 columns], 'TYBALT': TYBALT 2007-01-01 201.92 2007-01-02 255.38 2007-01-03 130.84 2007-01-04 17.96 2007-01-05 19.12 ... ... 2008-12-27 134.83 2008-12-28 81.88 2008-12-29 20.14 2008-12-30 208.54 2008-12-31 208.14 [731 rows x 1 columns]}
- 20240413 - 2.0.0 / 0.5.0 - Refactor hk88, new functions name.
- 20220402 - 1.1.1 - Change 0.3.7 to 0.4.0
- 20220318 - 1.1.0 - Improve read_workbook(...) function Issue #162
- 20191217 - 1.0.1 - Fix read_workbook() issue#95
- 20191213 - 1.0.0 - Initial
Source code in this notebook is licensed under a MIT License. Data in this notebook is licensed under a Creative Common Attribution 4.0 International.