#!/usr/bin/env python
# coding: utf-8

# # CABS - Data Publication

# In[1]:


# https://jupyter.pages.rwth-aachen.de/documentation/FAQ.html
#!pip install --user pandas


# In this Jupyter notebook we publish all data we have gathered during the research
# of the project *Continuum Analysis of Blood Samples*.
# 
# Below you will find all the results of the analysis, which were published in the paper,
# but also the methods and algorithms we used.

# For the whole data analysis we used these libraries.

# In[2]:


import pandas as pd
from io import StringIO 


# ## Data sets
# All data is shown below.

# In[3]:


# https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html
# Raw study data embedded as tab-separated text: the first non-empty line is the
# header, every following line is one data set. Empty fields are missing values.
# NOTE(review): row `w3tcl` has sample_handling "Froen", which is not part of the
# controlled vocabulary (Fresh, Frozen) listed in the column overview below —
# presumably a typo for "Frozen"; confirm against the laboratory records.
cabs_data ='''
UUID	ID	patient_ID	patient_ID_Biobank	patient_date_of_record	patient_date_of_birth	patient_age	patient_sex	patient_ARDS	patient_days_of_fever	sample_ID_lab	sample_cell_count	sample_handling	sample_date_slot_A	sample_date_slot_B	sample_date_slot_C
3qqyk	1	166	UKP5021575246F	2020-05-04	1964-01-07	56	male	true	29	LV2002010621	2.25	Frozen	2020-05-04	2020-05-08	2020-05-14
vpdn6	2	167	UKP7531655608C	2020-02-01	1937-01-22	83	male	false		LV2002029589	2.10	Fresh	2020-05-11	2020-05-13	2020-05-20
8bfor	3	168	UKP4734173975D	2020-09-04	1945-04-02	75	female	false		LV2002042498	1.58	Frozen		2020-05-14	2020-05-20
jfzcu	4	169	UKP0744831512E	2020-08-15	1964-01-13	56	male	true	43	LV2002511086	2.98	Fresh	2020-08-18	2020-08-24	2020-08-31
w3tcl	5	170	UKP3746829544V	2020-05-02	1992-05-02	28	male	true		LV2002051688	2.00	Froen	2020-08-21	2020-08-26	2020-09-02
jpnlw	6	171	UKP742105612D9	2020-09-06	1966-12-17	54		false	13	LV2002514028	2.33	Frozen	2020-09-07	2020-09-14	2020-09-23
biz5a	7	172	UKP65337288310	2020-09-23	1976-02-05	44	male	false		LV2002513827		Fresh	2020-09-24		2020-09-29
ooh5h	8	173	UKP25476702710	2020-05-01	1949-06-01	71	male	false	11	LV2002515631	2.70	Frozen	2020-09-18		
mk12i	9	174	UKP1284651396F	2020-09-27	1940-08-09	80	male	false	10	LV2002515278	1.43	Frozen	2020-09-28	2020-10-05	2020-10-12
f9pzb	10	175	UKP06046254078	2020-10-10	1964-06-01	56	female	true	8	LV2002515818	2.18	Frozen	2020-10-12	2020-10-16	2020-10-23
yl8ym	11	176	UKP3406107799F	2020-06-21	1973-06-25	47	male	true	34	LV2002518518	3.10	Frozen		2020-10-26	2020-11-02
6dqld	12	177	UKP253722024B9	2020-10-21	1937-10-01	83	male	false	5	LV2002514298	1.95	Frozen	2020-10-23	2020-10-27	2020-11-03
guvbx	13	178	UKP753968652DB	2020-05-04	1996-12-04	24	female	true	9	LV2002512694	1.40	Frozen	2020-10-26		
7ul1o	14	179	UKP2527316861B	2020-10-24	1953-01-15	67		true	22	LV2002515817	1.55	Frozen	2020-10-26	2020-10-30	2020-11-06
6vh0b	15	180	UKP219245104BE	2020-10-23	1944-05-16	76	male	true	11	LV2002515839	3.93	Frozen	2020-10-26	2020-10-30	2020-11-06
5cazn	16	181	UKP125283874C9	2020-08-02	1958-02-01	62	male	true		LV2002524576	4.90	Frozen	2020-10-27		2020-11-09
7lljq	17	182	UKP1616518832D	2020-10-28	1970-12-07	50	male	true	11	LV2002525650	1.40	Frozen	2020-10-29	2020-11-03	
tisd7	18	183	UKP78152624745	2020-09-28	1951-04-05	69		true		LV2002575948	2.70	Frozen		2020-11-03	2020-11-09
ppmn4	19	184	UKP468484497FE	2020-10-08	1969-10-12	51	male	true		LV2002515775	2.05	Fresh	2020-10-29	2020-11-03	2020-11-09
8dwba	20	185	UKP593262851D3	2020-11-29	1969-11-17	51	female	false		LV2002516928	0.78	Frozen	2020-10-30	2020-11-04	2020-11-10
'''


# We also give an overview of the columns used. But since there won’t be any calculations with the meta information we paste it as a plain table.
# 
# | Field                  | Original fieldname | description                                                               | category | controlled vocabulary | values            | origin                                                      |
# |------------------------|--------------------|---------------------------------------------------------------------------|----------|-----------------------|-------------------|-------------------------------------------------------------|
# | UUID                   |                    | a computer generated code (length of 5) for identifying a data set        | all      | false                 | alphanumeric      | externally generated (e.g. https://www.random.org/strings/) |
# | ID                     | FACS Sample ID     | id for handling the samples, human readable                               | all      | false                 | integer           | Laboratory of the CABS-Group                                |
# | patient_ID             | Pat-ID             | ID of patient in the system of the Biobank                                | patient  | false                 | integer           | Biobank                                                     |
# | patient_ID_Biobank     | Biobank-ID (UKP)   | ID for the database of the Biobank                                        | patient  | false                 | alphanumeric (15) | Biobank                                                     |
# | patient_date_of_record | Aufnahme           | Date when patient has been enrolled in study                              | patient  | false                 | date (YYYY-MM-DD) | Laboratory of the CABS-Group                                |
# | patient_date_of_birth  | Geburtstag         | Date of birth of patient                                                  | patient  | false                 | date (YYYY-MM-DD) | Biobank                                                     |
# | patient_age            | Alter              | Age of patient in years                                                   | patient  | false                 | integer           | Biobank                                                     |
# | patient_sex            | Geschlecht         | Sex of patient                                                            | patient  | true                  | male, female      | Biobank                                                     |
# | patient_ARDS           | Group              | Check whether patient had Acute Respiratory Distress Syndrome (ARDS)      | patient  | true                  | true, false       | Laboratory of the CABS-Group                                |
# | patient_days_of_fever  | FeverDays          | Amount of days patient had fever when enrolled in study                   | patient  | false                 | integer           | Laboratory of the CABS-Group                                |
# | sample_ID_lab          | Lab-ID             | ID of sample                                                              | sample   | false                 | alphanumeric      | Laboratory of the CABS-Group                                |
# | sample_cell_count      | Zellzahl           | Amount of cells in one sample (value times 10^6)                          | sample   | false                 | float             | Laboratory of the CABS-Group                                |
# | sample_handling        | Sample handling    | Condition of sample when arrived in laboratory                            | sample   | true                  | Fresh, Frozen     | Laboratory of the CABS-Group                                |
# | sample_date_slot_A     | Tag 1              | Date when blood sample has been taken from patient, day 1                 | sample   | false                 | date (YYYY-MM-DD) | Laboratory of the CABS-Group                                |
# | sample_date_slot_B     | Tag 5-7            | Date when blood sample has been taken from patient, between day 5 and 7   | sample   | false                 | date (YYYY-MM-DD) | Laboratory of the CABS-Group                                |
# | sample_date_slot_C     | Tag 12-14          | Date when blood sample has been taken from patient, between day 12 and 14 | sample   | false                 | date (YYYY-MM-DD) | Laboratory of the CABS-Group                                |

# As a first step we import the data set and store it in the variable `cabs`.
# A few adjustments are needed to parse the data correctly.

# In[4]:


# Columns that hold calendar dates (YYYY-MM-DD) and must be parsed as datetimes.
date_columns = [
    "patient_date_of_record",
    "patient_date_of_birth",
    "sample_date_slot_A",
    "sample_date_slot_B",
    "sample_date_slot_C",
]

# Wrap the embedded text in a file-like object so pandas can read it like a file;
# the fields are separated by tabs rather than the default comma.
cabs = pd.read_csv(
    StringIO(cabs_data),
    sep="\t",
    parse_dates=date_columns,
)


# Getting some first meta information about the data in general.
# With this overview, generated with `cabs.info()`, you can double-check whether the data has been read correctly.

# In[5]:


# Print the column names, non-null counts and dtypes of the parsed frame.
cabs.info()


# ## Data viewing

# Finally looking at the data itself and focusing on the first five rows.
# This is to check whether the parsing of dates, booleans, strings etc. has been done correctly.

# In[6]:


# Show the first five rows (five is also the pandas default).
cabs.head(n=5)


# In the study we focus mainly on certain information, like age of patient, sex, amount of days of fever, day of the first blood sample.

# In[7]:


# Columns the study mainly looks at, in display order.
focus_columns = [
    "UUID",
    "patient_date_of_birth",
    "patient_sex",
    "patient_days_of_fever",
    "sample_date_slot_A",
]
cabs[focus_columns]


# ## Data calculations
# 
# ### Average
# 
# On page 9 in the paper we discuss the average days of fever the patients had during the study.
# This is how we got the result.

# In[8]:


# Mean over the patients that have a recorded value (pandas skips missing entries);
# int() then drops the fractional part instead of rounding.
mean_days_of_fever = cabs.patient_days_of_fever.mean()
int(mean_days_of_fever)


# Same goes for the average age of all patients we discuss in the conclusion.

# In[9]:


# Mean age in years; int() drops the fractional part instead of rounding.
mean_patient_age = cabs['patient_age'].mean()
int(mean_patient_age)


# ### Age of patients
# 
# Of special interest were the blood samples (timeslot A) which were taken in September 2020.
# In the article we showed the calculated age of the patients at the time their first blood sample was taken.
# 
# This is how we extracted the samples and the patients.

# In[10]:


# Boolean mask for rows whose first blood sample (slot A) was taken in September 2020.
# The lower bound is inclusive so that samples taken on 2020-09-01 are part of the
# month; the upper bound is exclusive so that 2020-10-01 is not.
# Rows without a slot-A date (NaT) compare as False and are therefore excluded.
timeslot = (cabs["sample_date_slot_A"] >= "2020-09-01") & (cabs["sample_date_slot_A"] < "2020-10-01")
cabs.loc[timeslot]


# And now focusing on only the patient and the two days we want to calculate their gap from.

# In[11]:


# Restrict the September rows to the patient and the two dates we compare.
september_columns = ["patient_ID", "patient_date_of_birth", "sample_date_slot_A"]
cabs.loc[timeslot, september_columns]


# In[12]:


# Time between date of birth and the first blood sample, stored in days (not years).
age_at_first_sample = cabs['sample_date_slot_A'] - cabs['patient_date_of_birth']
cabs['patient_age_slot_A'] = age_at_first_sample.dt.days

# September patients, ordered from youngest to oldest at the first sample.
september_ages = cabs.loc[
    timeslot,
    ["patient_ID", "patient_date_of_birth", "sample_date_slot_A", 'patient_age_slot_A'],
]
september_ages.sort_values(by='patient_age_slot_A')


# This is how we determined the oldest and the youngest patient among those whose first blood sample was taken in September.

# ### Data control
# 
# In the paper we mention that there were some difficulties since not all blood samples were taken 
# within the specific date range.
# In the columns `sample_date_slot_A/B/C` you see the dates.
# The interval is important and should be within a certain date range:
# 
# * `sample_date_slot_A` is day one.
# * `sample_date_slot_B` should be five to seven days after the first blood sample.
# * `sample_date_slot_C` should be twelve to fourteen days after the first blood sample, or five to seven days after the second.
# 
# The problematic samples are highlighted and we have treated them accordingly.

# In[13]:


# Day differences between the sampling dates: new column -> (earlier slot, later slot).
# A missing date on either side yields a missing difference.
interval_slots = {
    'A_B': ('sample_date_slot_A', 'sample_date_slot_B'),
    'B_C': ('sample_date_slot_B', 'sample_date_slot_C'),
    'A_C': ('sample_date_slot_A', 'sample_date_slot_C'),
}
for interval_name, (earlier_slot, later_slot) in interval_slots.items():
    cabs[interval_name] = (cabs[later_slot] - cabs[earlier_slot]).dt.days

                                        
def highlight_max(x):
    """Return one CSS string per value of ``x``, marking sampling gaps outside the allowed range.

    Meant to be passed to ``Styler.apply`` with its default column-wise axis, in
    which case ``x`` is one column of day differences and ``x.name`` is the
    column label. The allowed range depends on that label:

    * ``A_B`` and ``B_C``: 5 to 7 days (inclusive)
    * ``A_C``: 12 to 14 days (inclusive)

    Input without a name, or with any other name, is checked against 5 to 7 days.

    Parameters
    ----------
    x : pandas.Series or iterable of numbers
        Day differences to check.

    Returns
    -------
    list of str
        ``''`` for values inside the allowed range and for missing values,
        ``'background-color: red'`` for every other value.
    """
    allowed_days = {'A_B': (5, 7), 'B_C': (5, 7), 'A_C': (12, 14)}
    lower, upper = allowed_days.get(getattr(x, 'name', None), (5, 7))
    # Missing gaps stay unstyled here so a separate null highlight can mark them.
    return ['' if pd.isna(v) or lower <= v <= upper else 'background-color: red'
            for v in x]


# The three day-difference columns are the only ones passed to highlight_max.
gap_columns = ['A_B', 'B_C', 'A_C']
interval_view = cabs[['patient_ID',
                      'sample_date_slot_A',
                      'sample_date_slot_B',
                      'sample_date_slot_C'] + gap_columns]

# highlight_max supplies the per-cell CSS for the gap columns; missing values
# anywhere in the table are then highlighted in yellow.
styled_intervals = interval_view.style.apply(
    highlight_max,
    subset=pd.IndexSlice[:, gap_columns],
)
styled_intervals.highlight_null("yellow")


# ## Data visualization

# ### Plot of patients and sex
# 
# In the study we stress the distribution of sex in relation to certain age groups.
# We divided the patients into three groups based on their age:
# 20-50; 50-65; 65-90. Each patient is assigned to exactly one of these groups.

# In[14]:


# Bin edges in years. pd.cut builds right-closed intervals by default,
# i.e. (20, 50], (50, 65] and (65, 90].
# NOTE(review): a patient aged exactly 20 would not fall into any group — confirm
# that this is intended.
age_bin_edges = [20, 50, 65, 90]
age_groups = pd.cut(cabs.patient_age, bins=age_bin_edges)
age_groups


# Second we combine the age groups with the information about the individual sex.

# In[15]:


# Count patients per age group (rows) and sex (columns).
# Patients without a recorded sex are not counted in any column.
sex_age = pd.crosstab(index=age_groups, columns=cabs.patient_sex)
sex_age


# And finally we plot the data. This is figure 2 in the article.

# In[16]:


# Grouped (side-by-side, not stacked) bars: one bar per sex within each age group.
sex_age.plot.bar(stacked=False)


# In[17]:


# Distribution of the cell count per sample, drawn without grid lines.
cell_count_columns = ['sample_cell_count']
cabs.boxplot(column=cell_count_columns, grid=False, return_type='axes')


# In[18]:


# Distribution of the days of fever per patient, drawn without grid lines.
fever_columns = ['patient_days_of_fever']
cabs.boxplot(column=fever_columns, grid=False, return_type='axes')


# ## Data search
# 
# Imagine you only have the UUID of a data set and you would like to get all the other information.

# In[19]:


# UUID of the data set to look up.
patient_uuid = "jpnlw"


# In[20]:


# Keep the row(s) whose UUID matches, then show only the three sampling dates.
uuid_matches = cabs['UUID'] == patient_uuid
patient_search = cabs.loc[uuid_matches]
sampling_date_columns = ['sample_date_slot_A', 'sample_date_slot_B', 'sample_date_slot_C']
patient_search[sampling_date_columns]


# In[ ]: