The original data sheet has each orchestra in a separate spreadsheet tab, which makes it harder to conduct new analysis or generate intersectional counts (e.g. pieces composed by Women of Colour). By appending all the tabs into one sheet, this analysis can be completed.
Written by Ben Coleman
import pandas as pd import numpy as np
# Define filepath and import workbook ImportFilePath = "2021-01-02_BREAKDOWN_ 2019-2020 ORCHESTRAL PROGRAMMING.xlsx" ICD1920Original = pd.read_excel(ImportFilePath, sheet_name=None)
# Creates final data frame ICD1920Combined = pd.DataFrame(None,None,['Orchestra', 'Composer', 'WorkTitle', 'WorkCount', 'Living', 'Women', 'Heritages'])
# Processes and appends individual orchestra worksheets for worksheetKey in ICD1920Original.keys(): if ICD1920Original[worksheetKey].columns[0:2].to_list() == ['Unnamed: 0', 'Unnamed: 1']: worksheetTemp = ICD1920Original[worksheetKey] # add orchestra column based on worksheetKey orchestraList = [worksheetKey] * len(worksheetTemp.index) worksheetTemp.insert(0,'Orchestra',orchestraList,True) worksheetTemp.columns=['Orchestra', 'Composer', 'WorkTitle', 'WorkCount', 'Living', 'Women', 'Heritages'] # remove top two summary rows and occasional "*Heritages bottom row" worksheetTemp = worksheetTemp[worksheetTemp.WorkTitle.notnull()] # append to final dataframe ICD1920Combined = ICD1920Combined.append(worksheetTemp)