#!/usr/bin/env python # coding: utf-8 # ## About this resource # All of the tutorial notebooks as well as information about the dependent package (`nma-ibl`) can be found at [nma-ibl GitHub repository](https://github.com/int-brain-lab/nma-ibl). # ## Setting up the environment (particularly for Colab users) # Please execute the cells below to install the necessary dependencies and prepare the environment. # In[ ]: # install IBL pipeline package to access and navigate the pipeline get_ipython().system('pip install --quiet nma-ibl') # Download data needed for plot recreation get_ipython().system('wget https://github.com/vathes/nma-ibl/raw/master/uuids_trained1.npy') # # Replication of study figures # One of the immense strenghts of [DataJoint](https://datajoint.io) pipelines lies in the tight data integrity and full tracking of all processing and computations as captured by the data pipeline. Here we demonstrate how a study figure based on the IBL pipeline can be replicated using data freshly fetched from the data pipeline. # In the study [A standardized and reproducible method to measure decision-making in mice](https://doi.org/10.1101/2020.01.17.909838), the authors have shown that the animal behavior in a visual decision-making task is similar across 9 labs in 7 institutions across 3 countries, when using a standardized, reproduciable experimental hardware, software, and procedures. # This notebook replicates Figure 2 from that work, which shows a similar learning rate of animals across different labs. # This notebook was generated based on [this repository](https://github.com/int-brain-lab/paper-behavior), allowing us to perform figure replications on a local machine! # Let's connect to the database again. 
# Use the public credentials `ibl-public`:

# In[ ]:

import datajoint as dj

# Read-only public credentials for the IBL public database.
dj.config['database.host'] = 'datajoint-public.internationalbrainlab.org'
dj.config['database.user'] = 'ibl-public'
dj.config['database.password'] = 'ibl-public'
dj.conn()  # explicitly verify that the connection to database can be established

# # Import modules

# To start with, we import some modules that will be used in the rest of the notebook:

# In[ ]:

import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

# We also import modules that allow us to interact with the schemas and tables in the IBL DataJoint pipeline.

# In[ ]:

from nma_ibl import reference, subject, behavior_analyses

# Here is an overview of what each schema contains:
# * `reference` schema contains lab, user and project information
# * `subject` schema contains information about subjects
# * `behavior_analyses` schema contains results of standardized analyses, including the training status

# Here are some modules that define pre-defined figure settings

# In[ ]:

# NOTE: `seaborn_style` was previously imported twice in this tuple; the
# duplicate has been removed.
from nma_ibl.paper_behavior_functions import (query_subjects, seaborn_style,
                                              group_colors, institution_map)

# # Initialize figure settings

# In[ ]:

seaborn_style()
pal = group_colors()
# NOTE(review): this call rebinds the imported name `institution_map` from the
# helper function to the dict it returns; downstream cells rely on the dict.
institution_map, col_names = institution_map()
col_names = col_names[:-1]

# # Query subjects that are trained

# We pre-selected the "trained" animals based on the following criteria and saved their uuids in the file `uuids_trained1.npy`
# - 0% and 6% contrasts had been introduced to the contrast set.
# - 200 trials were completed with >80% performance on easy (100% and 50% contrasts) trials in each of the last three sessions.
# - A four-parameter psychometric curve (bias, lapse left, lapse right, threshold) fitted to performance on all trials from the last three sessions had parameter values of bias < 16, threshold < 19, and lapses < 0.2.
# # In[ ]: uuids = np.load('uuids_trained1.npy', allow_pickle=True) # We could then fetch the animals in the data pipeline corresponding to their uuids: # In[ ]: subjects = subject.Subject & [{'subject_uuid': uuid} for uuid in uuids] # These are the 101 subjects reported in this study: # In[ ]: subjects # To include all information that are needed for subjects, we pre-queried subjects with the function `query_subject`. # In[ ]: use_subjects = query_subjects() use_subjects # One important field used in Figure 2 in this table is `date_trained`, which is the first date that the animal reached the trained criteria. # # Fetch data from the trained animals as a data frame # The summary statistics of the behavior are processed and saved in `behavior_analyses.BehavioralSummaryByDate`: # In[ ]: behavior_analyses.BehavioralSummaryByDate() # - performance: the correct rate on all trials of the date # - performance_easy: the correct rate on easy trials that contrast is greater than 50% # - n_trials_date: totoal number of trials on the date # - training_day: days since the animal is in training, starting from zero. # - training_week: days since the animal is in training, starting from zero. 
# Join the BehavioralSummaryByDate table with subject query to gather info together:

# In[ ]:

b = behavior_analyses.BehavioralSummaryByDate * use_subjects
b

# Then we could fetch the contents in the table and return the data as a data frame:

# In[ ]:

behav = b.fetch(order_by='institution_short, subject_nickname, training_day',
                format='frame').reset_index()
behav['institution_code'] = behav.institution_short.map(institution_map)
behav

# Now compute how many mice are there for each institution and add the column to the dataframe

# In[ ]:

N = behav.groupby(['institution_code'])['subject_nickname'].nunique().to_dict()
behav['n_mice'] = behav.institution_code.map(N)
behav['institution_name'] = behav.institution_code + \
    ': ' + behav.n_mice.apply(str) + ' mice'
behav

# # Fig 2a, plot learning curves of animals in each of the institution

# In Fig 2a, we plot the performance on easy trials `performance_easy` as a function of `training_day` for each animal in each institution.
#
# For plotting purpose, we create another column only after the mouse is trained, and performance before the training date is marked as NaN:

# In[ ]:

# Build the masked per-mouse frames, then concatenate once at the end.
# (DataFrame.append was removed in pandas 2.0 and was quadratic when called
# inside a loop; pd.concat on a collected list is the supported equivalent.)
masked_groups = []
for _, group in behav.groupby(['institution_code', 'subject_nickname']):
    group = group.copy()  # avoid mutating the view handed out by groupby
    group['performance_easy_trained'] = group.performance_easy
    # Blank out performance before the day the mouse reached "trained" status.
    group.loc[group['session_date'] < pd.to_datetime(group['date_trained']),
              'performance_easy_trained'] = np.nan
    masked_groups.append(group)
behav = pd.concat(masked_groups)

# Finally we generate the figure. The following cell may take some time to run.
# In[ ]:

# Express performance as a percentage for plotting.
behav['performance_easy'] = behav.performance_easy * 100
behav['performance_easy_trained'] = behav.performance_easy_trained * 100

# One panel per lab; within each panel, one faint curve per mouse.
grid = sns.FacetGrid(behav, col="institution_code", col_wrap=4,
                     col_order=col_names, sharex=True, sharey=True,
                     aspect=1, hue="subject_uuid", xlim=[-1, 41.5])
grid.map(sns.lineplot, "training_day",
         "performance_easy", color='gray', alpha=0.3)
grid.map(sns.lineplot, "training_day",
         "performance_easy_trained", color='darkblue', alpha=0.3)
grid.set_titles("{col_name}")

# Re-title each panel with "<lab>: N mice", colored per-lab.
panel_titles = behav.institution_name.unique()
for panel_idx, ax in enumerate(grid.axes.flat):
    ax.set_title(panel_titles[panel_idx],
                 color=pal[panel_idx], fontweight='bold')

# Highlight the example mouse KS014 in black on the first panel.
example_mouse = behav[behav['subject_nickname'].str.contains('KS014')]
sns.lineplot(ax=grid.axes[0], x='training_day', y='performance_easy',
             color='black', data=example_mouse, legend=False)

grid.set_axis_labels('Training day', 'Performance (%) on easy trials')
grid.despine(trim=True)

# Performance on easy contrast trials (50% and 100% contrast) across mice and laboratories. Each panel represents a different lab, and each curve represents a mouse (gray). The transition from gray to blue indicates when performance criteria for "trained" are met.
# Black, performance for example mouse `KS014`

# # Fig 2b - plot the learning curve averaged over animals for all institutions

# In[ ]:

# Plot all labs
fig, ax1 = plt.subplots(1, 1, figsize=(5, 4))
# NOTE(review): `ci=None` is deprecated in seaborn >= 0.12 (use `errorbar=None`);
# kept as-is here — confirm the seaborn version pinned by nma-ibl before changing.
sns.lineplot(x='training_day', y='performance_easy', hue='institution_code',
             palette=pal, ax=ax1, legend=False, data=behav, ci=None)
ax1.set_title('All labs', color='k', fontweight='bold')
ax1.set(xlabel='Training day',
        ylabel='Performance (%) on easy trials', xlim=[-1, 41.5])

seaborn_style()
plt.tight_layout(pad=2)

# # Print some statistics

# In[ ]:

# Mean and standard deviation of easy-trial performance across all mice,
# grouped by training day.
behav_summary_std = behav.groupby(['training_day'])[
    'performance_easy'].std().reset_index()
behav_summary = behav.groupby(['training_day'])[
    'performance_easy'].mean().reset_index()

# First training day on which the across-mice mean exceeds 80% accuracy.
print('number of days to reach 80% accuracy on easy trials: ')
print(behav_summary.loc[behav_summary.performance_easy > 80,
                        'training_day'].min())

# # Conclusion

# And that's it! You have now completed the introductory tutorials for navigating and accessing the IBL data pipeline, and hopefully this sets you on a good track to take a deeper dive into this rich and exciting dataset.
#
# Be sure to visit [DataJoint.io](https://datajoint.io) for further learning resources for DataJoint. Also be sure to sign up for our DataJoint Slack group (link on the website) to join the vibrant DataJoint user community!