#!/usr/bin/env python
# coding: utf-8

# [![image](https://raw.githubusercontent.com/visual-layer/visuallayer/main/imgs/vl_horizontal_logo.png)](https://www.visual-layer.com)
#
# # Use fastdup to cluster heatmaps of user GPS movements
#
# [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/visual-layer/fastdup/blob/main/examples/heatmaps.ipynb)
# [![Open in Kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://kaggle.com/kernels/welcome?src=https://github.com/visual-layer/fastdup/blob/main/examples/heatmaps.ipynb)
#
# In this tutorial we learn how to cluster heatmaps, look at their labels and
# find anomalies in heatmaps. As an example, heatmaps are generated from the
# GPS coordinates of NY taxi data; many other use cases that produce heatmaps
# can be handled by fastdup the same way.

# In[29]:

# Install the required packages
# pip install folium
get_ipython().run_line_magic('pip', 'install selenium')

# In[30]:

import folium
from folium.plugins import HeatMap
import random

# Download Kaggle New York's taxi fare prediction test.csv dataset from
# https://www.kaggle.com/competitions/new-york-city-taxi-fare-prediction/data

# In[32]:

import pandas as pd

df = pd.read_csv('test.csv')

# In[33]:

df.head()

# In[8]:

len(df)

# In[9]:

# Record the working directory; the selenium step below needs an absolute
# file:// path to the saved HTML map.
pwd = get_ipython().getoutput('pwd')[0]
print(pwd)

# ## Generate simulated data of two types: either real taxi GPS coordinates or random noise.
# In[22]:

import folium
from folium.plugins import HeatMap
import random
from tqdm import tqdm
import numpy as np
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# folium expects coordinates in (latitude, longitude) order. The original
# (longitude, latitude) order centered every map at an invalid location and
# plotted the heat points far from New York — select latitude first.
coordinates = df[['pickup_latitude', 'pickup_longitude']].to_numpy()
drop_coordinates = df[['dropoff_latitude', 'dropoff_longitude']].to_numpy()

# Range for the number of points sampled into each heatmap
min_n_points = 70
max_n_points = 100

# Number of heatmap images to generate
n_repeats = 100

get_ipython().system('rm -fr heatmaps')
get_ipython().system('mkdir -p heatmaps')

# Start one headless Chrome and reuse it for every screenshot — launching and
# quitting a browser per iteration (as before) is needlessly slow and changes
# nothing about the saved PNGs.
options = Options()
options.add_argument('--headless')
driver = webdriver.Chrome(options=options)

labels = []
filenames = []
for i in tqdm(range(n_repeats)):
    # Choose a random number of points to sample
    n_points = random.randint(min_n_points, max_n_points)

    # Alternate between pickup and dropoff coordinates
    if i % 2 == 0:
        sampled_coordinates = random.sample(list(coordinates), n_points)
        label = "pickup"
    else:
        sampled_coordinates = random.sample(list(drop_coordinates), n_points)
        label = "dropoff"

    # Every 10th heatmap is turned into a synthetic "noise" anomaly: with
    # std_dev == 0 the Gaussian term vanishes, so every point is collapsed
    # onto the mean longitude, producing an easy-to-spot vertical line.
    # NOTE(review): std_dev = 0.0 looks intentional (degenerate anomaly);
    # raise it to get a genuinely noisy cloud instead.
    if i % 10 == 0:
        sampled_coordinates = np.array(sampled_coordinates)  # back to n x 2 matrix
        mean_lon = np.mean(sampled_coordinates[:, 1])  # column 1 is longitude
        std_dev = 0.0
        gaussian_noise = np.random.normal(0, std_dev, len(sampled_coordinates))
        sampled_coordinates[:, 1] = mean_lon + gaussian_noise
        sampled_coordinates = list(sampled_coordinates)
        label = "noise"

    # Unique output file for this heatmap
    filename = f"heatmaps/map_{i}.png"

    # Build a folium map centered at the first sampled point, overlay the
    # heatmap layer, save to HTML, then screenshot the HTML to a PNG.
    m = folium.Map(location=sampled_coordinates[0], zoom_start=12)
    HeatMap(sampled_coordinates).add_to(m)
    m.save('map.html')
    driver.get('file://' + pwd + '/map.html')
    driver.save_screenshot(filename)

    labels.append(label)
    filenames.append(filename)

driver.quit()

# ## Now look at the generated heatmaps

# In[42]:

files = get_ipython().getoutput("find heatmaps -name '*.png'")

# In[45]:

import fastdup

# Tile the generated heatmaps into a single sprite image for a quick look
ret = fastdup.generate_sprite_image(files, 48, ".")[0]
from IPython.display import Image
Image(filename=ret)

# # Build fastdup model to cluster heatmaps together

# In[23]:

fd = fastdup.create(input_dir='heatmaps', work_dir='heat_out')

# In[24]:

fd.run(overwrite=True, cc_threshold=0.94)

# ## Plot heatmaps together

# In[26]:

# Join the generated labels back onto the fastdup components so the gallery
# can show which cluster is pickup / dropoff / noise.
cdf = fastdup.find_top_components('heat_out')
labels_df = pd.DataFrame({'filename': filenames, 'label': labels})
label_dict = pd.Series(labels_df.label.values, index=labels_df.filename).to_dict()
cdf['label'] = cdf['files'].apply(lambda x: [label_dict[y] for y in x])
fd.vis.component_gallery(load_crops=False, sort_by='comp_size',
                         external_df=cdf, label_col='label', ascending=True)

# ## Analyze outliers

# In[27]:

fd.vis.outliers_gallery()
# + ๐Ÿ–ผ [**Analyze Image Classification Dataset**](https://nbviewer.org/github/visual-layer/fastdup/blob/main/examples/analyzing-image-classification-dataset.ipynb): Learn how to load a labeled image classification dataset and analyze for potential issues. If you have labeled ImageNet-style folder structure, have a go! # + ๐ŸŽ [**Analyze Object Detection Dataset**](https://nbviewer.org/github/visual-layer/fastdup/blob/main/examples/analyzing-object-detection-dataset.ipynb): Learn how to load bounding box annotations for object detection and analyze for potential issues. If you have a COCO-style labeled object detection dataset, give this example a try. # # ## VL Profiler # If you prefer a no-code platform to inspect and visualize your dataset, [**try our free cloud product VL Profiler**](https://app.visual-layer.com) - VL Profiler is our first no-code commercial product that lets you visualize and inspect your dataset in your browser. # # [Sign up](https://app.visual-layer.com) now, it's free. # # [![image](https://raw.githubusercontent.com/visual-layer/fastdup/main/gallery/vl_profiler_promo.svg)](https://app.visual-layer.com) # # As usual, feedback is welcome! # # Questions? Drop by our [Slack channel](https://visualdatabase.slack.com/join/shared_invite/zt-19jaydbjn-lNDEDkgvSI1QwbTXSY6dlA#/shared-invite/email) or open an issue on [GitHub](https://github.com/visual-layer/fastdup/issues). #