#!/usr/bin/env python
# coding: utf-8

# # Feathr Fraud Detection Sample
#
# This notebook illustrates how to use a feature store to build a model that predicts whether a transaction is fraudulent, based on user account data and transaction data. The main focus of this notebook is to show:
# * How a feature designer can define heterogeneous features from different data sources (user account data and transaction data) with different keys by using Feathr, and
# * How a feature consumer can extract features using multiple `FeatureQuery` objects.
#
# The sample fraud transaction datasets used in this notebook can be found here: https://github.com/microsoft/r-server-fraud-detection.
#
# The outline of the notebook is as follows:
# 1. Setup Feathr environment
# 2. Initialize Feathr client
# 3. Define features
# 4. Build features and extract offline features
# 5. Build a fraud detection model
# 6. Materialize features

# ## 1. Setup Feathr Environment
#
# ### Deploy Necessary Azure Resources to run Feathr Feature Store
#
# If you have not yet deployed all the required resources, please follow the deployment guide before running this notebook: https://feathr-ai.github.io/feathr/how-to-guides/azure-deployment-arm.html
#
# ### Access to Resources
# To run the cells below, your identity needs additional permissions to access the Key Vault and the Storage Account. You can run the following commands in the Cloud Shell to grant yourself access.
#
# ```
# userId=
# resource_prefix=
# synapse_workspace_name="${resource_prefix}syws"
# keyvault_name="${resource_prefix}kv"
# objectId=$(az ad user show --id $userId --query id -o tsv)
# az keyvault update --name $keyvault_name --enable-rbac-authorization false
# az keyvault set-policy -n $keyvault_name --secret-permissions get list --object-id $objectId
# az role assignment create --assignee $userId --role "Storage Blob Data Contributor"
# az synapse role assignment create --workspace-name $synapse_workspace_name --role "Synapse Contributor" --assignee $userId
# ```
#
# ### Install Python Packages
#
# Uncomment the following cell and run it to install the Feathr Python package and the necessary dependencies.

# In[ ]:

# Install Feathr from the latest code in the repo. You may use `pip install feathr[notebook]` as well.
# %pip install "git+https://github.com/feathr-ai/feathr.git#subdirectory=feathr_project&egg=feathr[notebook]"

# ## 2. Initialize Feathr Client
# In[ ]:

from datetime import datetime, timedelta
import os
from pathlib import Path

import numpy as np
import pandas as pd

import feathr
from feathr import (
    FeathrClient,
    STRING, BOOLEAN, FLOAT, INT32, ValueType,
    Feature, DerivedFeature, FeatureAnchor,
    BackfillTime, MaterializationSettings,
    FeatureQuery, ObservationSettings,
    RedisSink,
    HdfsSource,
    WindowAggTransformation,
    TypedKey,
)
from feathr.datasets.constants import (
    FRAUD_DETECTION_ACCOUNT_INFO_URL,
    FRAUD_DETECTION_FRAUD_TRANSACTIONS_URL,
    FRAUD_DETECTION_UNTAGGED_TRANSACTIONS_URL,
)
from feathr.datasets.utils import maybe_download
from feathr.utils.config import generate_config
from feathr.utils.job_utils import get_result_df
from feathr.utils.platform import is_databricks

print(f"Feathr version: {feathr.__version__}")


# In[ ]:

RESOURCE_PREFIX = ""  # TODO fill the value used to deploy the resources via ARM template
PROJECT_NAME = "fraud_detection"

# Currently supported values: 'azure_synapse', 'databricks', and 'local'
SPARK_CLUSTER = "local"

# TODO fill values to use a Databricks cluster:
DATABRICKS_CLUSTER_ID = None  # Set Databricks cluster id to use an existing cluster
if is_databricks():
    # If this notebook is running on Databricks, its context can be used to retrieve the token and instance URL
    ctx = dbutils.notebook.entry_point.getDbutils().notebook().getContext()
    DATABRICKS_WORKSPACE_TOKEN_VALUE = ctx.apiToken().get()
    SPARK_CONFIG__DATABRICKS__WORKSPACE_INSTANCE_URL = f"https://{ctx.tags().get('browserHostName').get()}"
else:
    DATABRICKS_WORKSPACE_TOKEN_VALUE = None  # Set Databricks workspace token to use Databricks
    SPARK_CONFIG__DATABRICKS__WORKSPACE_INSTANCE_URL = None  # Set Databricks workspace url to use Databricks

# TODO fill values to use an Azure Synapse cluster:
AZURE_SYNAPSE_SPARK_POOL = None  # Set Azure Synapse Spark pool name
AZURE_SYNAPSE_URL = None  # Set Azure Synapse workspace url to use Azure Synapse
ADLS_KEY = None  # Set Azure Data Lake Storage key to use Azure Synapse

USE_CLI_AUTH = False  # Set to True to use Azure CLI authentication

# An existing Feathr config file path. If None, we'll generate a new config based on the constants in this cell.
FEATHR_CONFIG_PATH = None

# (For the notebook test pipeline) If True, use the ScrapBook package to collect the results.
SCRAP_RESULTS = False


# In[ ]:

if SPARK_CLUSTER == "azure_synapse" and not os.environ.get("ADLS_KEY"):
    os.environ["ADLS_KEY"] = ADLS_KEY
elif SPARK_CLUSTER == "databricks" and not os.environ.get("DATABRICKS_WORKSPACE_TOKEN_VALUE"):
    os.environ["DATABRICKS_WORKSPACE_TOKEN_VALUE"] = DATABRICKS_WORKSPACE_TOKEN_VALUE


# In[ ]:

# Get an authentication credential to access Azure resources and register features
if USE_CLI_AUTH:
    # Use interactive Azure CLI authentication (device code login)
    get_ipython().system('az login --use-device-code')
    from azure.identity import AzureCliCredential
    credential = AzureCliCredential(additionally_allowed_tenants=['*'],)
elif "AZURE_TENANT_ID" in os.environ and "AZURE_CLIENT_ID" in os.environ and "AZURE_CLIENT_SECRET" in os.environ:
    # Use a service principal secret from environment variables
    from azure.identity import EnvironmentCredential
    credential = EnvironmentCredential()
else:
    # Try to use the default credential
    from azure.identity import DefaultAzureCredential
    credential = DefaultAzureCredential(
        exclude_interactive_browser_credential=False,
        additionally_allowed_tenants=['*'],
    )


# In[ ]:

# Redis password
if 'REDIS_PASSWORD' not in os.environ:
    from azure.keyvault.secrets import SecretClient
    vault_url = f"https://{RESOURCE_PREFIX}kv.vault.azure.net"
    secret_client = SecretClient(vault_url=vault_url, credential=credential)
    retrieved_secret = secret_client.get_secret('FEATHR-ONLINE-STORE-CONN').value
    # The secret is a Redis connection string; extract the value that follows "password=".
    os.environ['REDIS_PASSWORD'] = retrieved_secret.split(",")[1].split("password=", 1)[1]


# ### Generate a config file

# In[ ]:

if FEATHR_CONFIG_PATH:
    config_path = FEATHR_CONFIG_PATH
else:
    config_path = generate_config(
        resource_prefix=RESOURCE_PREFIX,
        project_name=PROJECT_NAME,
        spark_config__spark_cluster=SPARK_CLUSTER,
        spark_config__azure_synapse__dev_url=AZURE_SYNAPSE_URL,
        spark_config__azure_synapse__pool_name=AZURE_SYNAPSE_SPARK_POOL,
        spark_config__databricks__workspace_instance_url=SPARK_CONFIG__DATABRICKS__WORKSPACE_INSTANCE_URL,
        databricks_cluster_id=DATABRICKS_CLUSTER_ID,
    )

with open(config_path, 'r') as f:
    print(f.read())


# ### Initialize Feathr client

# In[ ]:

client = FeathrClient(config_path=config_path, credential=credential)


# ## 3. Define Features
#
# ### Prepare datasets
#
# We prepare the fraud detection dataset as follows:
#
# 1. Download the account info data, fraud transactions data, and untagged transactions data.
# 2. Tag the transaction data based on the fraud transactions data (a toy illustration of the tagging rule is shown right below):
#     1. Aggregate the fraud table at the account level, creating a start and end datetime per account.
#     2. Join this data with the untagged data.
#     3. Tag the data: `is_fraud = 0` for non-fraud, `1` for fraud.
# 3. Upload the data files to the cloud so that Feathr's target cluster can consume them.
#
# To learn more about the fraud detection scenario, the dataset source we use, and the way we tag the transactions, please see [here](https://microsoft.github.io/r-server-fraud-detection/data-scientist.html).
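# The next cell is a toy illustration (not part of the original sample) of the tagging rule in step 2 above: the account IDs and timestamps are made up purely to show how a per-account fraud time range turns into an `is_fraud` label. The real tagging on the downloaded datasets is done in the cells further below.

# In[ ]:

# Toy illustration of the tagging rule (hypothetical data, for intuition only).
toy_fraud = pd.DataFrame({
    "accountID": ["A1", "A1"],
    "timestamp": ["20130901 120000", "20130901 180000"],
})
toy_untagged = pd.DataFrame({
    "accountID": ["A1", "A1", "A2"],
    "timestamp": ["20130901 130000", "20130902 090000", "20130901 140000"],
})

# Step 2.1: per-account fraud time range
toy_range = (
    toy_fraud.groupby("accountID")["timestamp"]
    .agg(timestamp_min="min", timestamp_max="max")
    .reset_index()
)

# Steps 2.2 and 2.3: join with the untagged data and tag each transaction.
# A transaction is labeled as fraud if it falls inside its account's fraud time range;
# transactions of accounts with no fraud records are labeled 0.
toy_tagged = toy_untagged.merge(toy_range, on="accountID", how="left")
toy_tagged["is_fraud"] = toy_tagged.apply(
    lambda row: int(row["timestamp_min"] <= row["timestamp"] <= row["timestamp_max"])
    if pd.notna(row["timestamp_min"]) else 0,
    axis=1,
)
toy_tagged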
# In[ ]:

# Use dbfs if the notebook is running on Databricks
if is_databricks():
    WORKING_DIR = f"/dbfs/{PROJECT_NAME}"
else:
    WORKING_DIR = PROJECT_NAME


# In[ ]:

# Download datasets
account_info_file_path = f"{WORKING_DIR}/account_info.csv"
fraud_transactions_file_path = f"{WORKING_DIR}/fraud_transactions.csv"
obs_transactions_file_path = f"{WORKING_DIR}/obs_transactions.csv"

maybe_download(
    src_url=FRAUD_DETECTION_ACCOUNT_INFO_URL,
    dst_filepath=account_info_file_path,
)
maybe_download(
    src_url=FRAUD_DETECTION_FRAUD_TRANSACTIONS_URL,
    dst_filepath=fraud_transactions_file_path,
)
maybe_download(
    src_url=FRAUD_DETECTION_UNTAGGED_TRANSACTIONS_URL,
    dst_filepath=obs_transactions_file_path,
)


# In[ ]:

# Load datasets
fraud_df = pd.read_csv(fraud_transactions_file_path)
obs_df = pd.read_csv(obs_transactions_file_path)


# In[ ]:

# Combine transactionDate and transactionTime into one column. E.g. "20130903", "013641" -> "20130903 013641"
fraud_df["timestamp"] = fraud_df["transactionDate"].astype(str) + " " + fraud_df["transactionTime"].astype(str).str.zfill(6)
obs_df["timestamp"] = obs_df["transactionDate"].astype(str) + " " + obs_df["transactionTime"].astype(str).str.zfill(6)


# In this step, we compute the timestamp range in which each account's fraudulent transactions occurred, based on the transaction-level fraud data.
# We then use that range to create the `is_fraud` labels on the untagged transaction data.

# In[ ]:

# For each account in the fraud transaction data, get the timestamp range in which its fraud transactions occurred.
fraud_labels_df = fraud_df.groupby("accountID").agg({"timestamp": ['min', 'max']})
fraud_labels_df.columns = ["_".join(col) for col in fraud_labels_df.columns.values]
fraud_labels_df.head()


# In[ ]:

# Combine the fraud and untagged transaction data to generate the tagged transaction data.
transactions_df = pd.concat([fraud_df, obs_df], ignore_index=True).merge(
    fraud_labels_df,
    on="accountID",
    how="outer",
)

# Data cleaning
transactions_df.dropna(
    subset=[
        "accountID",
        "transactionID",
        "transactionAmount",
        "localHour",
        "timestamp",
    ],
    inplace=True,
)
transactions_df.sort_values("timestamp", inplace=True)
transactions_df.drop_duplicates(inplace=True)

# is_fraud = 0 if the transaction is not fraud. Otherwise (if it is fraud), is_fraud = 1.
transactions_df["is_fraud"] = np.logical_and(
    transactions_df["timestamp_min"] <= transactions_df["timestamp"],
    transactions_df["timestamp"] <= transactions_df["timestamp_max"],
).astype(int)

transactions_df.head()


# In[ ]:

transactions_df["is_fraud"].value_counts()


# In[ ]:

# Save the tagged transaction data to a file
transactions_file_path = f"{WORKING_DIR}/transactions.csv"
transactions_df.to_csv(transactions_file_path, index=False)


# In[ ]:

# Upload files to the cloud if needed
if client.spark_runtime == "local":
    # In local mode, we can use the same data path as the source.
    # If the notebook is running on Databricks, the file paths should already be dbfs paths.
    account_info_source_path = account_info_file_path
    transactions_source_path = transactions_file_path
elif client.spark_runtime == "databricks" and is_databricks():
    # If the notebook is running on Databricks, we can use the same data path as the source.
    account_info_source_path = account_info_file_path.replace("/dbfs", "dbfs:")
    transactions_source_path = transactions_file_path.replace("/dbfs", "dbfs:")
else:
    # Otherwise, upload the local files to the cloud storage (either dbfs or adls).
    account_info_source_path = client.feathr_spark_launcher.upload_or_get_cloud_path(account_info_file_path)
    transactions_source_path = client.feathr_spark_launcher.upload_or_get_cloud_path(transactions_file_path)


# Now, we can define the following features:
# - Account features: account-level features that will be joined to the observation data on accountID
# - Transaction features: features that will be joined to the observation data on transactionID
# - Transaction aggregated features: features aggregated by accountID
# - Derived features: features derived from other features
#
# Some important concepts include `HdfsSource`, `TypedKey`, `Feature`, `FeatureAnchor`, and `DerivedFeature`. Please refer to the Feathr [documentation](https://feathr.readthedocs.io/en/latest/feathr.html) to learn more about the details.
#
# ### Define account features
#
# Let's first check the account data.

# In[ ]:

# Check the account data
pd.read_csv(account_info_file_path).head()


# Here, we use `accountCountry`, `isUserRegistered`, `numPaymentRejects1dPerUser`, and `accountAge` as the account features.

# In[ ]:

def account_preprocessing(df):
    """Drop rows with missing values in the account info dataset.

    Note: preprocessing functions receive and return a PySpark DataFrame.
    """
    return df.select(
        "accountID",
        "accountCountry",
        "isUserRegistered",
        "numPaymentRejects1dPerUser",
        "accountAge",
    ).dropna(subset=["accountID"])


account_info_source = HdfsSource(
    name="account_data",
    path=account_info_source_path,
    preprocessing=account_preprocessing,
)


# In[ ]:

# Account features will be joined to the observation data on accountID
account_id = TypedKey(
    key_column="accountID",
    key_column_type=ValueType.STRING,
    description="account id",
)

account_features = [
    Feature(
        name="account_country_code",
        key=account_id,
        feature_type=STRING,
        transform="accountCountry",
    ),
    Feature(
        name="is_user_registered",
        key=account_id,
        feature_type=BOOLEAN,
        transform="isUserRegistered==TRUE",
    ),
    Feature(
        name="num_payment_rejects_1d_per_user",
        key=account_id,
        feature_type=INT32,
        transform="numPaymentRejects1dPerUser",
    ),
    Feature(
        name="account_age",
        key=account_id,
        feature_type=INT32,
        transform="accountAge",
    ),
]

account_anchor = FeatureAnchor(
    name="account_features",
    source=account_info_source,
    features=account_features,
)


# ### Define transaction features
#
# We already checked the transaction dataset when we tagged the fraud label `is_fraud`, so let's jump straight to defining the features.
# In[ ]:

def transaction_preprocessing(df):
    """Preprocess the transaction data."""
    import pyspark.sql.functions as F
    return df.withColumn("ipCountryCode", F.upper("ipCountryCode"))


transactions_source = HdfsSource(
    name="transaction_data",
    path=transactions_source_path,
    event_timestamp_column="timestamp",
    timestamp_format="yyyyMMdd HHmmss",
    preprocessing=transaction_preprocessing,
)


# In[ ]:

# Transaction features will be joined to the observation data on transactionID
transaction_id = TypedKey(
    key_column="transactionID",
    key_column_type=ValueType.STRING,
    description="transaction id",
)

transaction_amount = Feature(
    name="transaction_amount",
    key=transaction_id,
    feature_type=FLOAT,
    transform="transactionAmount",
)

transaction_features = [
    transaction_amount,
    Feature(
        name="transaction_country_code",
        key=transaction_id,
        feature_type=STRING,
        transform="ipCountryCode",
    ),
    Feature(
        name="transaction_time",
        key=transaction_id,
        feature_type=FLOAT,
        transform="localHour",  # Local time of the transaction
    ),
    Feature(
        name="is_proxy_ip",
        key=transaction_id,
        feature_type=STRING,  # [nan, True, False]
        transform="isProxyIP",
    ),
    Feature(
        name="cvv_verify_result",
        key=transaction_id,
        feature_type=STRING,  # [nan, 'M', 'P', 'N', 'X', 'U', 'S', 'Y']
        transform="cvvVerifyResult",
    ),
]

transaction_feature_anchor = FeatureAnchor(
    name="transaction_features",
    source=transactions_source,
    features=transaction_features,
)


# ### Define transaction aggregated features
#
# These features are aggregated over time windows per account, e.g. the average transaction amount over the past week. A rough pandas analogue of the 7-day average is sketched at the end of this feature-definition section.

# In[ ]:

# Average amount of transactions in the past week
avg_transaction_amount = Feature(
    name="avg_transaction_amount",
    key=account_id,
    feature_type=FLOAT,
    transform=WindowAggTransformation(
        agg_expr="cast_float(transactionAmount)", agg_func="AVG", window="7d"
    ),
)

agg_features = [
    avg_transaction_amount,
    # Number of transactions that took place in a day
    Feature(
        name="num_transaction_count_in_day",
        key=account_id,
        feature_type=INT32,
        transform=WindowAggTransformation(
            agg_expr="transactionID", agg_func="COUNT", window="1d"
        ),
    ),
    # Number of transactions that took place in the past week
    Feature(
        name="num_transaction_count_in_week",
        key=account_id,
        feature_type=INT32,
        transform=WindowAggTransformation(
            agg_expr="transactionID", agg_func="COUNT", window="7d"
        ),
    ),
    # Total amount of transactions that took place in a day
    Feature(
        name="total_transaction_amount_in_day",
        key=account_id,
        feature_type=FLOAT,
        transform=WindowAggTransformation(
            agg_expr="cast_float(transactionAmount)", agg_func="SUM", window="1d"
        ),
    ),
    # Average local time of transactions in the past week
    Feature(
        name="avg_transaction_time_in_week",
        key=account_id,
        feature_type=FLOAT,
        transform=WindowAggTransformation(
            agg_expr="cast_float(localHour)", agg_func="AVG", window="7d"
        ),
    ),
]

agg_anchor = FeatureAnchor(
    name="transaction_agg_features",
    source=transactions_source,
    features=agg_features,
)


# ### Define derived features

# In[ ]:

derived_features = [
    DerivedFeature(
        name="diff_between_current_and_avg_amount",
        key=[transaction_id, account_id],
        feature_type=FLOAT,
        input_features=[transaction_amount, avg_transaction_amount],
        transform="transaction_amount - avg_transaction_amount",
    ),
]
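# As referenced above, the next cell is a rough, purely illustrative pandas analogue (not part of the original sample) of what the 7-day `avg_transaction_amount` aggregation computes. The real values are computed by Feathr on Spark with point-in-time semantics when the features are joined to the observation data, so the numbers here may differ slightly (for example, the pandas rolling window includes the current transaction). The names `analogue_df` and `avg_amount_7d` are local helpers introduced only for this sketch.

# In[ ]:

# Rough pandas analogue of the 7-day average transaction amount per account (for intuition only).
analogue_df = transactions_df.assign(
    ts=pd.to_datetime(transactions_df["timestamp"], format="%Y%m%d %H%M%S")
)
avg_amount_7d = (
    analogue_df.sort_values("ts")
    .set_index("ts")
    .groupby("accountID")["transactionAmount"]
    .rolling("7d")  # time-based window ending at each transaction
    .mean()
    .rename("avg_transaction_amount_7d")
    .reset_index()
)
avg_amount_7d.head()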
# ## 4. Build Features and Extract Offline Features
#
# Now, let's build the features.

# In[ ]:

client.build_features(
    anchor_list=[
        account_anchor,
        transaction_feature_anchor,
        agg_anchor,
    ],
    derived_feature_list=derived_features,
)


# In[ ]:

account_feature_names = [feat.name for feat in account_features] + [feat.name for feat in agg_features]
transactions_feature_names = [feat.name for feat in transaction_features]
derived_feature_names = [feat.name for feat in derived_features]


# To extract the offline feature values for features that have different keys, we use multiple `FeatureQuery` objects.

# In[ ]:

account_feature_query = FeatureQuery(
    feature_list=account_feature_names,
    key=account_id,
)
transactions_feature_query = FeatureQuery(
    feature_list=transactions_feature_names,
    key=transaction_id,
)
derived_feature_query = FeatureQuery(
    feature_list=derived_feature_names,
    key=[transaction_id, account_id],
)

settings = ObservationSettings(
    observation_path=transactions_source_path,
    event_timestamp_column="timestamp",
    timestamp_format="yyyyMMdd HHmmss",
)

client.get_offline_features(
    observation_settings=settings,
    feature_query=[account_feature_query, transactions_feature_query, derived_feature_query],
    output_path=transactions_source_path.rpartition("/")[0] + "/fraud_transactions_features.avro",
)

client.wait_job_to_finish(timeout_sec=5000)


# In[ ]:

df = get_result_df(client)[
    account_feature_names + transactions_feature_names + derived_feature_names + ["is_fraud", "timestamp"]
]
df.head(5)


# ## 5. Build a Fraud Detection Model
#
# We use a [Random Forest Classifier](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html) to build the fraud detection model.

# In[ ]:

from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    PrecisionRecallDisplay,
)
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder


# ### Understand the dataset

# In[ ]:

df.describe().T


# In[ ]:

df.nunique()


# In[ ]:

# Plot only a sub-sample for simplicity
NUM_SAMPLES_TO_PLOT = 5000

fig = px.scatter_matrix(
    df.sample(n=NUM_SAMPLES_TO_PLOT, random_state=42),
    dimensions=df.columns[:-2],  # exclude the label and timestamp
    color="is_fraud",
    labels={col: col.replace('_', ' ') for col in df.columns},  # remove underscores
)
fig.update_traces(diagonal_visible=False, showupperhalf=False, marker_size=3, marker_opacity=0.5)
fig.update_layout(
    width=2000,
    height=2000,
    title={"text": "Scatter matrix for transaction dataset", "font_size": 20},
    font_size=6,
)
fig.show()


# ### Split training and validation sets

# In[ ]:

n_train = int(len(df) * 0.7)
train_df = df.iloc[:n_train]
test_df = df.iloc[n_train:]

print(f"""Training set:
{train_df["is_fraud"].value_counts()}

Validation set:
{test_df["is_fraud"].value_counts()}
""")


# In[ ]:

# Check that the time ranges of the training and test sets don't overlap
train_df["timestamp"].max(), test_df["timestamp"].min()


# ### Train and test a machine learning model

# In[ ]:

# Get labels as integers
y_train = train_df["is_fraud"].astype(int).to_numpy()
y_test = test_df["is_fraud"].astype(int).to_numpy()


# In[ ]:

# We convert categorical features into integer values by using one-hot encoding and ordinal encoding
categorical_feature_names = [
    "account_country_code",
    "transaction_country_code",
    "cvv_verify_result",
]
ordinal_feature_names = [
    "is_user_registered",
    "is_proxy_ip",
]


# In[ ]:
one_hot_encoder = OneHotEncoder(sparse_output=False).fit(df[categorical_feature_names])
ordinal_encoder = OrdinalEncoder().fit(df[ordinal_feature_names])


# In[ ]:

ordinal_encoder.categories_


# In[ ]:

one_hot_encoder.categories_


# In[ ]:

X_train = np.concatenate(
    (
        one_hot_encoder.transform(train_df[categorical_feature_names]),
        ordinal_encoder.transform(train_df[ordinal_feature_names]),
        train_df.drop(categorical_feature_names + ordinal_feature_names + ["is_fraud", "timestamp"], axis="columns").fillna(0).to_numpy(),
    ),
    axis=1,
)

X_test = np.concatenate(
    (
        one_hot_encoder.transform(test_df[categorical_feature_names]),
        ordinal_encoder.transform(test_df[ordinal_feature_names]),
        test_df.drop(categorical_feature_names + ordinal_feature_names + ["is_fraud", "timestamp"], axis="columns").fillna(0).to_numpy(),
    ),
    axis=1,
)


# In[ ]:

clf = RandomForestClassifier(
    n_estimators=50,
    random_state=42,
).fit(X_train, y_train)


# In[ ]:

clf.score(X_test, y_test)


# In[ ]:

y_pred = clf.predict(X_test)
y_pred


# In[ ]:

y_prob = clf.predict_proba(X_test)
y_prob


# To measure the performance on this imbalanced dataset, we use recall, precision, and F1 score, which handle class imbalance better than plain accuracy (a majority-class baseline is sketched at the end of this section to make this point concrete).

# In[ ]:

display = PrecisionRecallDisplay.from_predictions(
    y_test,
    y_prob[:, 1],
    name="RandomForestClassifier",
)
_ = display.ax_.set_title("Fraud Detection Precision-Recall Curve")


# In[ ]:

precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"""Precision: {precision},
Recall: {recall},
F1: {f1}""")


# In[ ]:

confusion_matrix(y_test, y_pred)


# ### Feature importance

# In[ ]:

numeric_feature_names = [
    name for name in train_df.columns
    if name not in set(categorical_feature_names + ordinal_feature_names + ["is_fraud", "timestamp"])
]
numeric_feature_names


# In[ ]:

# The order of features is [categorical features, ordinal features, numeric features]
importances = clf.feature_importances_[-len(numeric_feature_names):]
std = np.std([tree.feature_importances_[-len(numeric_feature_names):] for tree in clf.estimators_], axis=0)

fig = px.bar(
    pd.DataFrame([numeric_feature_names, importances, std], index=["Numeric features", "importances", "std"]).T,
    y="Numeric features",
    x="importances",
    error_x="std",
    orientation="h",
    title="Importance of the numeric features",
)
fig.update_layout(showlegend=False, width=1000)
fig.update_xaxes(title_text="Mean decrease in impurity", range=[0, 0.5])
fig.update_yaxes(title_text="Numeric features")
fig.show()


# In[ ]:

feature_names = categorical_feature_names + ordinal_feature_names
categories = one_hot_encoder.categories_ + ordinal_encoder.categories_

start_i = 0
n_rows = len(feature_names)
fig = make_subplots(
    rows=n_rows,
    cols=1,
    subplot_titles=[name.replace("_", " ") for name in feature_names],
    x_title="Mean decrease in impurity",
)

for i in range(n_rows):
    category = categories[i]
    end_i = start_i + len(category)
    fig.add_trace(
        go.Bar(
            x=clf.feature_importances_[start_i:end_i],
            y=category,
            width=0.2,
            error_x=dict(
                type="data",
                array=np.std([tree.feature_importances_[start_i:end_i] for tree in clf.estimators_], axis=0),
            ),
            orientation="h",
        ),
        row=i+1,
        col=1,
    )
    start_i = end_i

fig.update_layout(title="Importance of the categorical features", showlegend=False, width=1000, height=1000)
fig.update_xaxes(range=[0, 0.5])
fig.show()
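# As referenced above, the next cell is a small illustrative addition (not part of the original sample): a majority-class baseline built with scikit-learn's `DummyClassifier`. On an imbalanced dataset such as this one, always predicting "not fraud" can still reach a high accuracy while catching no fraud at all, which is why precision, recall, and F1 are the metrics reported in this section. The names `baseline_clf` and `baseline_pred` are local helpers introduced only for this sketch.

# In[ ]:

from sklearn.dummy import DummyClassifier

# Baseline that always predicts the most frequent class (i.e., "not fraud").
baseline_clf = DummyClassifier(strategy="most_frequent").fit(X_train, y_train)
baseline_pred = baseline_clf.predict(X_test)

print(f"""Baseline accuracy: {baseline_clf.score(X_test, y_test)}
Baseline recall: {recall_score(y_test, baseline_pred)}
Baseline F1: {f1_score(y_test, baseline_pred, zero_division=0)}""")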
# ## 6. Materialize Features in Redis
#
# Now, we materialize the features to a `RedisSink` so that we can retrieve online features.

# In[ ]:

ACCOUNT_FEATURE_TABLE_NAME = "fraudDetectionAccountFeatures"

backfill_time = BackfillTime(
    start=datetime(2013, 8, 4),
    end=datetime(2013, 8, 4),
    step=timedelta(days=1),
)


# In[ ]:

client.materialize_features(
    MaterializationSettings(
        ACCOUNT_FEATURE_TABLE_NAME,
        backfill_time=backfill_time,
        sinks=[RedisSink(table_name=ACCOUNT_FEATURE_TABLE_NAME)],
        feature_names=account_feature_names[1:],
    ),
    allow_materialize_non_agg_feature=True,
)

client.wait_job_to_finish(timeout_sec=5000)


# In[ ]:

materialized_feature_values = client.get_online_features(
    ACCOUNT_FEATURE_TABLE_NAME,
    key="A1055520452832600",
    feature_names=account_feature_names[1:],
)
materialized_feature_values


# Scrap results for the unit test

# In[ ]:

if SCRAP_RESULTS:
    import scrapbook as sb
    sb.glue("materialized_feature_values", materialized_feature_values)
    sb.glue("precision", precision)
    sb.glue("recall", recall)
    sb.glue("f1", f1)


# ### Cleanup

# In[ ]:

# Clean up the output files. CAUTION: this may be dangerous if you "reused" the project name.
import shutil

shutil.rmtree(WORKING_DIR, ignore_errors=False)