#!/usr/bin/env python
# coding: utf-8

# In[2]:


# Run shared.ipynb via the %run magic; the %%capture cell magic swallows any
# output that run produces.
# NOTE(review): presumably this is where df, alle_skeive and the other names
# used by the cells below are defined — confirm against shared.ipynb.
get_ipython().run_cell_magic('capture', '', '%run shared.ipynb\n')


# In[ ]:


# questioning_KID = JUST Kjonnsidentitet
# questioning_Cis = JUST Cisness
# questioning_gender = Kjønnsidentitet and/or Cisness
# questioning_plus = JUST anyone who has SU in their orientation
# questioning  = any or all of the above
# 

# ```
# 
# NOTES FROM MIN:
# 
# DICTIONARY = the thing in squiggly braces with colons. It is made up of:
#     KEYS on the left of the colon (usually strings but can be anything hashable)
#         hashable = things that can be turned into a number
#         list = modifiable, tuple = not, so tuples are hashable and can be keys; lists are not and cannot
#     VALUES on the right of the colon (can be literally anything)
#     
# It looks like this:
# ```
# 
# ```python
# dictionary_name = {
#     "key" : value(s),
#     "another_key" : more_value(s),
# }
# ```
# ```
# Dictionary of tuples (values look like (thing, other_thing, "thing", etc.))
# Dictionary of dictionaries, the values can be variable names for other dictionaries (like making a dictionary of all my groups), or nested literals
# 
# 
# - global namespace = the dictionary Python looks in when I type a name like LG_df; it maps that name to the actual object, which is how it finds the dataframe
# 
# - 'get a handle on' - how to find the thing you want to change, find a method or a function that returns that thing so that you can modify or interact with it in some way
# 
# EXPRESSION = a bunch of symbols that has a result, calling a function can be an expression 
#              what you put inside brackets is an expression, whose result is always a series
#              one expression can have many terms
#              you can always store the result of an expression in a variable
#              rather than having df[big complicated expression]
#              variable = whatever you had in brackets
#              df[variable]
# 
# ARRAY = a sequence of things, a special list
# 
# INDEX = the thing (number or string) you use to look up items in a container ("indexing into an array" = the position in the array, 0 = the one at the beginning)
# 
# SERIES = the main thing that distinguishes a series from an array is that a series has a customizable index
#          is two arrays, kind of like a dictionary - the arrays are the index and the values
#          a column is a series - the index is like a row ID, the actual contents of the column are the values
#          if you do something like sorting or slicing it maintains the relationship between the index & value
#          a dataframe is just a collection of series that have the same index (that's why nans are there)
#          when you do arithmetic or something on a series, it usually means doing that operation element-wise (for each value, 
#          do the thing)
#              
#              orientation == bi - actually doing that operation on every item in the values, returning a new series with the 
#              same index where the values are the result of whatever operation you did
#                          
#          df["Hyppighet_n"] = df.Hyppighet.apply(Hyppighet_map.get)
#             In the order that it gets 'done':
#               - get the dataframe called df
#               - .Hyppighet = look up the column called Hyppighet (now we have a series)
#               - .apply() is a method on series that says call this function and return a new series where each item in the 
#               series is the result of calling that function on each item in the hyppighet series 
#               - .get is getting an item out of a dictionary
#               - df["Hyppighet_n"] = store the resulting series in a new column with this name
#               - the same kind of thing happens with .isin() or .str.contains() or == or >
#               (they all return a series of booleans, true/false)
#          
# THING[OTHERTHING] = "get item" (or "set item" if it's on the left side of an equals sign)
#     - how you get things out of a container of things (like a list or a tuple or a dataframe or a dictionary)
#     - In Pandas, if you treat it like a dictionary and give it a string, that will return a column. If you give it a 'mask' 
#         it will return a new dataframe or series (if you give a series a mask you get a series back, df get df back) where 
#         that mask is true
#     - with things like arrays and dataframes, one of the things you can pass to get item is a 'mask'
# 
# MASK: series of booleans (true/false) that has the same index as the dataframe or series (columns are series, series are not necessarily columns) you can mask a series or a dataframe, masking a dataframe is the same as masking all the series at once
# 
# df[df.column_name] or df[df.column_name == "something"] or df[df.column_name.isin(["something"])]
# Different examples of applying one mask
# saying give me the subset (rows) where this mask is true - what you need to do when you are f.eks. creating a group
# 
# or 'if this is true or this is true...' - combining a bunch of masks into one mask:
# df[(df.column_name == "something") | df.column_name.isin(["something"])]
# (each comparison needs its own parentheses, because | binds more tightly than ==)
# 
# inside the brackets is a mask - there are lots of ways to make a mask, any series of operations - the result of whatever you put together is a series of booleans with an index that matches the thing you are trying to mask
# 
# .dropna() changes the index because it's applying a mask - series.isna() = a series
# if you use dropna inside your filter, you have to also do it outside (filter and mask are the same thing)
# 
# creating a mask vs. applying a mask
# .isna() creates a mask
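# .dropna() applies the inverse of that mask; for a series it is equivalent to series[~series.isna()]
# (for a dataframe, .dropna() drops every row that has a nan in any column)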
# 
# 
# When you have df[mask] you're changing the index (picking a subset) to be the items where the mask is true
# when do you apply the mask? "apply the mask" means another_df = df[mask] or another_series = series[mask]
# never use len on the mask (doesn't provide information, creating a mask doesn't change the length - always has one value for every row)
# if you're interested in the values of the same column or another column where the mask is true, that's when you apply the mask 
# 
# 
# Kira's way: use brackets when applying a mask, no brackets means making it but not applying it
# 
# Min's way: *shakes head in disappointment*
# Step 1: Creating the mask (big boolean expression that usually starts with df.) is saying find where these things are true
# Step 2: Give me the subset of rows where it was true in a way that I can interact with it (apply the mask in order to get that subset of another column)
# With a mask, there's only one question you can answer - for how many rows is this condition true (or false) mask.value_counts() or sum(mask)
# If you want to answer other questions about a group for which those things are true, then apply the mask (which creates a new df or series) and store it as a new variable so you can work with it
# 
# 
# .isin(["T", "OT"]) = value of the column is exactly one of the items in the list (identical to chaining (col == "T") | (col == "OT"), with separate parentheses for each).
# .str.contains("T") will get T, Thing, Thingy, etc. = that substring appears anywhere in the column. 
# ````

# In[4]:


# Exploratory helper: the %psearch magic lists the names in the namespace that
# match the wildcard pattern *df*.
get_ipython().run_line_magic('psearch', '*df*')


# In[5]:


#pd.set_option('max_rows', None)


# In[6]:


# Uncomment to lift the display limits so every column and row is shown:
#pd.options.display.max_columns = None
#pd.options.display.max_rows = None

# Every column name of df, unpacked into a plain Python list
[*df.columns]


# In[7]:


# Number of rows in df whose Seksuell_orientering contains "Q", divided by the
# size of alle_skeive, then printed as a percentage with one decimal.
# NOTE(review): the numerator is counted over df while the denominator is
# len(alle_skeive) — confirm that mixing the two frames is intended.

# na=False treats a missing answer as "no match", so a single NaN cannot turn
# the whole total into NaN; .sum() is the vectorised count of True values.
queer_frac = df.Seksuell_orientering.str.contains("Q", na=False).sum() / len(alle_skeive)

# Two equivalent ways to format a decimal as a percentage:
print("{:.1%}".format(queer_frac))
# or
print(f"{queer_frac:.1%}")


# In[8]:


# Subgroups of the "Bibliotekarer" survey; Skeiv is used as a boolean mask to
# split the librarians into the two groups.
bibliotekarer = df.loc[df["survey"] == "Bibliotekarer"]
skeive_bibliotekarer = bibliotekarer.loc[bibliotekarer["Skeiv"]]
cishet_bibliotekarer = bibliotekarer.loc[~bibliotekarer["Skeiv"]]
print(len(skeive_bibliotekarer))

# Rows where Seksuell_orientering contains "Ace" or RO contains "Aro"
ace_plus = df.loc[
    df["Seksuell_orientering"].str.contains("Ace")
    | df["RO"].str.contains("Aro")
]
len(ace_plus)


# In[9]:


#x_groups is a dictionary, and the strings are the keys, and the dataframes (LG, gay, etc.) are the values. 
#You can get one of the values by asking for one of the keys, so if you type x_groups["X"] then you will get the gay dataframe.

#Make them into a group df


# In[10]:


# Histogram of the feilkjonnet column of cisness_df
cisness_df.feilkjonnet.hist()


# In[11]:


# Number of comma-separated entries in Orienteringer for each row of
# alle_skeive: strip the leading/trailing commas, split on the remaining ones
# and count the parts. Computed once and reused for both outputs below.
# .str.len() (unlike .apply(len)) passes a missing value through as NaN
# instead of raising TypeError.
orientation_counts = alle_skeive.Orienteringer.str.strip(",").str.split(",").str.len()

# Bar chart of the share of rows per count ...
orientation_counts.value_counts(normalize=True).plot(kind='bar')
# ... and the absolute numbers behind it
orientation_counts.value_counts()
#alle_skeive.Orienteringer.unique()


# In[12]:


#Drops all 'neutral' responses from those who did not change the pre-set neutral on ANY questions (N=12)

# columns = ["Utvalg_KID", "Utvalg_Orientering", "Utvalg_Intersex", "Utvalg_lykkelig", "Utvalg_fag", "Aldri_tenkt", "Utrygt_stille_spm", "Rep_matters_B", "Bib_pleier_ha", "Alltid_velkommen", "Trygge_rom_gen_B", "Ingen_rolle", "Ingenting_tilby", "Bibs_ansvar", "Minoritetsstress_ansatte", "Minoritetsstress_brukere", "Lhbtiq_vennlig", "Rom_for_forbedring", "Andre_brukere", "Helt_meg_selv"]
# non_participants = True
# for column in columns:
#     non_participants &= df[column] == 0

# df.loc[non_participants, columns] = pd.NA


# In[13]:


# Call print_info_by_gender once per key of gender_keys for the
# "Forklare_SIAN_B_U" column, printing an empty line after each call.
# NOTE(review): gender_keys and print_info_by_gender are not defined in this
# notebook — presumably they come from shared.ipynb; confirm.
for key in gender_keys.keys():
    print_info_by_gender(key, column="Forklare_SIAN_B_U")
    print()


# In[ ]:


# Fraction of rows in s_women that have no Avrunding_B value. The mean of a
# boolean mask is exactly that share, and it yields NaN rather than raising
# ZeroDivisionError if s_women happens to be empty.
s_women.Avrunding_B.isna().mean()


# In[ ]:


# Rows of df whose NR value does not occur in exclusive_orientation_df.NR.
# Per the original note these are the non-captured entries (those with more
# than one orientation selected).
noncaptured = df.loc[~df["NR"].isin(exclusive_orientation_df["NR"])]
#noncaptured.loc[noncaptured.Orienteringer == "Het,", ["RO", "Seksuell_orientering", "Kjonnsidentitet"]]
#noncaptured.Orienteringer.value_counts()