#!/usr/bin/env python
# coding: utf-8

# # Memory Footprint
#
# ## Analyze STAR Library

# In[1]:


import os
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
import collections
import tk


# In[2]:


# Word sizes of the builds being compared.
build_bits = [32, 64]
# Short build-type key -> full build-type name.
build_types = {'min': 'MinSizeRel',
               'rel': 'Release',
               'deb': 'RelWithDebInfo',
               'dic': 'Release-nodict' }

# NOTE(review): presumably the root directory the `tk` helpers resolve build
# outputs against -- confirm against the `tk` module.
tk._prefix = '/Users/dsmirnov/work/pub-data/star-32-vs-64-build'


# In[3]:


# Every (bits, type) combination, once keyed by the short type and once by the
# full type name; both products iterate in the same order, so zip() pairs them.
builds_k = list(itertools.product(build_bits, build_types.keys()))
builds_v = list(itertools.product(build_bits, build_types.values()))

Build = collections.namedtuple('Build', ['id', 'bits', 'type', 'name'])

# id is e.g. 'rel32', name is e.g. '32-Release'.
builds = [Build(f'{kk}{kb}', kb, kk, f'{vb}-{vt}') for (kb, kk), (vb, vt) in zip(builds_k, builds_v)]
builds


# In[4]:


def id_to_name(ids):
    """Translate build id(s) such as 'rel32' into display name(s).

    Parameters
    ----------
    ids : str or iterable of str
        A single build id, or a collection of build ids.

    Returns
    -------
    str or list of str
        For a single id: its display name, or 'NotFound' if no build has that
        id. For a collection: the names of the known ids, in the order the ids
        were given (unknown ids are skipped). The result is used as positional
        legend labels, so it has to follow the caller's order rather than the
        order of `builds`.
    """
    names = {b.id: b.name for b in builds}
    if isinstance(ids, str):
        return names.get(ids, 'NotFound')
    return [names[i] for i in ids if i in names]


# In[5]:


#id_to_name('min32')
#id_to_name(['rel32', 'min64', 'rel64'])


# In[6]:


libpaths = {b.id: tk.get_libpath(b) for b in builds}
logfiles = {b.id: tk.get_log(b) for b in builds}


# In[7]:


# Only the files directly inside each library directory are needed, i.e. the
# first triple produced by os.walk(). A missing directory yields no triple at
# all; fall back to an empty listing so that every build id has an entry.
libfiles = {}
for b in builds:
    _, _, filenames = next(os.walk(libpaths[b.id]), (None, [], []))
    libfiles[b.id] = filenames


# In[8]:


#libfiles['rel32'] == libfiles['min32'], libfiles['rel64'] == libfiles['min64']
#libfiles['rel32'], libfiles['min32']


# In[9]:


len(libfiles['rel32'])


# In[10]:


# One row per library found in the 'rel32' build: (file name, size in every
# build). os.path.join() works whether or not the library path ends with a
# separator.
libinfo = [
    (libfile, *(os.path.getsize(os.path.join(libpath, libfile)) for libpath in libpaths.values()))
    for libfile in libfiles['rel32']
]


# In[11]:


df_libs = pd.DataFrame(libinfo, columns=['name']+[b.id for b in builds])
df_libs


# In[12]:


bins = np.logspace(4, 9, 50)
hist_kwargs = dict(bins=bins, histtype='step', lw=1)

fig, axs = plt.subplots(1, 2, figsize=(8,4))

# Left panel: plain histogram, right panel: cumulative histogram.
panels = [
    (axs[0], False, dict(prop={'size': 8})),
    (axs[1], True, dict(loc='lower right', prop={'size': 8})),
]

for ax, cumulative, legend_kwargs in panels:
    for b in builds:
        data = df_libs[b.id]
        label = f'{b.name}: {data.sum()/1024/1024:.0f} MB'
        ax.hist(data, label=label, **hist_kwargs, cumulative=cumulative)
    ax.legend(**legend_kwargs)

for ax in np.ravel(axs):
    ax.semilogx()
    ax.grid()
    ax.set_xlabel('File Size in Bytes')

plt.tight_layout()
plt.savefig('graphics/memory_lib_file_sizes.svg')


# In[13]:


df_libs_sorted = df_libs.sort_values('min32', ascending=False, inplace=False)
#df_libs_sorted


# In[14]:


# The 20 largest libraries according to their size in the 'min32' build.
select = df_libs_sorted.index[:20]
#select
datax = df_libs_sorted.loc[select]
#datax


# ## Release vs RelWithDebInfo

# In[15]:


value_vars = [b.id for b in builds if b.id.startswith(('rel', 'deb'))]
value_vars


# In[16]:


datax2 = datax.melt(id_vars='name', value_vars=value_vars)
#datax2


# In[17]:


fig = plt.figure(figsize=(7,5))
ax = sns.barplot(y='name', x='value', hue='variable', data=datax2)
# Re-create the legend with human-readable build names instead of build ids.
ax.legend(ax.legend().get_patches(), id_to_name(value_vars))
ax.set(xlabel='File Size in Bytes', ylabel='Library Name')
ax.semilogx()
ax.grid()
plt.tight_layout()
plt.savefig('graphics/memory_lib_file_sizes_top20.svg')


# ## Release vs Release-nodict

# In[18]:


value_vars = [b.id for b in builds if b.id.startswith(('rel', 'dic'))]
value_vars


# In[19]:


datax2 = datax.melt(id_vars='name', value_vars=value_vars)
#datax2


# In[20]:


fig = plt.figure(figsize=(7,5))
ax = sns.barplot(y='name', x='value', hue='variable', data=datax2)
# Re-create the legend with human-readable build names instead of build ids.
ax.legend(ax.legend().get_patches(), id_to_name(value_vars))
ax.set(xlabel='File Size in Bytes', ylabel='Library Name')
ax.semilogx()
ax.grid()
plt.tight_layout()
plt.savefig('graphics/memory_lib_file_sizes_top20_nodict.svg')


# ## Results from /proc/PID/status files

# In[21]:


memfiles = {b.id: tk.get_proc(b) for b in builds}


# In[22]:


# Builds without a status file are skipped, so `dfs` may hold fewer entries
# than `builds`.
dfs = {}
for bid, memfile in memfiles.items():
    if not os.path.isfile(memfile):
        continue
    dfs[bid] = pd.read_csv(memfile, skipinitialspace=True)


# In[23]:


df = pd.concat(dfs.values(), keys=dfs.keys(), names=['id'])


# In[24]:


# Turn the outer index level (the build id) into a regular 'id' column.
df.reset_index(level=0, inplace=True)
# In[25]:


df


# In[26]:


len(dfs['rel32']), len(dfs['rel64']), len(dfs['deb32']), len(dfs['deb64'])


# In[27]:


def make_fig(data, figname, title='', labelsize=10):
    """Plot VmSize, VmRSS and VmLib per call and save the figure as SVG.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns 'callerId', 'id', 'VmSize', 'VmRSS' and
        'VmLib'; one series is drawn per distinct 'id'.
    figname : str
        Base name of the output file, written to 'graphics/<figname>.svg'.
    title : str, optional
        Axes title.
    labelsize : int, optional
        Font size of the (rotated) x tick labels.
    """
    plt.figure(figsize=(7,5))

    # The three metrics share one axes and are told apart by marker only.
    for column, marker in (('VmSize', 'x'), ('VmRSS', '.'), ('VmLib', '+')):
        ax = sns.pointplot(x='callerId', y=column, hue='id', data=data, markers=marker, scale=0.8)

    ax.tick_params(axis='x', labelrotation=90, labelsize=labelsize)
    ax.grid()
    ax.set_ylim(bottom=0)
    ax.set(title=title, xlabel='', ylabel='Memory Usage, MB')
    #[t.set_text(id_to_name(t.get_text())) for t in ax.legend().get_texts()]
    ax.get_legend().remove()
    plt.tight_layout()
    plt.savefig('graphics/'+figname+'.svg')


# In[28]:


select = df.callerId.str.contains('Construct:') & df.id.str.contains('rel')
data = df.loc[select].copy()
data.callerId = data.callerId.str.replace('Construct:', '')
make_fig(data, 'memory_calls_constructor_rel', 'StMaker::StMaker() Calls')


# In[29]:


select = df.callerId.str.contains('Construct') & (df.id.str.contains('rel')|df.id.str.contains('dic'))
data = df.loc[select].copy()
data.callerId = data.callerId.str.replace('Construct:', '')
make_fig(data, 'memory_calls_constructor_rel_dic', 'StMaker::StMaker() Calls')


# In[30]:


select = df.callerId.str.contains('Load:') & (df.id.str.contains('rel'))
#select = df.callerId.str.contains('Load:') & (df.id.str.contains('rel')|df.id.str.contains('dic'))
data = df.loc[select].copy()
data.callerId = data.callerId.str.replace('Load:', '')
make_fig(data, 'memory_calls_load', 'ROOT::Load(\"libFoo.so\") Calls', 5)
#make_fig(data, 'memory_calls_load_rel_dic', 'ROOT::Load(\"libFoo.so\") Calls', 5)


# In[31]:


# NOTE(review): this cell is identical to In[30] and rewrites the same file;
# possibly meant to be the rel+dic variant hinted at by the commented lines --
# confirm before removing or changing it.
select = df.callerId.str.contains('Load:') & (df.id.str.contains('rel'))
#select = df.callerId.str.contains('Load:') & (df.id.str.contains('rel')|df.id.str.contains('dic'))
data = df.loc[select].copy()
data.callerId = data.callerId.str.replace('Load:', '')
make_fig(data, 'memory_calls_load', 'ROOT::Load(\"libFoo.so\") Calls', 5)
#make_fig(data, 'memory_calls_load_rel_dic', 'ROOT::Load(\"libFoo.so\") Calls', 5)


# In[32]:


select = df.callerId.str.contains('Init:') & df.id.str.contains('rel')
data = df.loc[select].copy()
data.callerId = data.callerId.str.replace('Init:', '')
make_fig(data, 'memory_calls_init', 'StMaker::Init() Calls')


# In[33]:


def call_indices(df):
    """Locate the rows that mark the stages of a job in a status table.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a string column 'callerId'.

    Returns
    -------
    tuple
        (load, con, ini, clr, evt) where `load` is a one-element list holding
        the index of the first 'Load:' row and the others are pandas Index
        objects of all rows matching 'Construct:inputStream',
        'Init:inputStream', 'Clear:analysis' and 'EndMaker:inputStream'.

    Raises
    ------
    IndexError
        If no row contains 'Load:'.
    """
    def rows_matching(pattern):
        return df.index[df.callerId.str.contains(pattern)]

    load = [rows_matching('Load:')[0]]
    con = rows_matching('Construct:inputStream')
    ini = rows_matching('Init:inputStream')
    clr = rows_matching('Clear:analysis')
    evt = rows_matching('EndMaker:inputStream')
    return load, con, ini, clr, evt


# In[34]:


idx_load, idx_con, idx_ini, idx_clr, idx_evt = call_indices(dfs['rel32'])


# In[35]:


def plot_vertical_lines(indices, color):
    """Draw a vertical line across the current y range at every x in `indices`.

    `color` is passed to plt.plot() as the format string (e.g. 'C0:').
    """
    for idx in indices:
        # The y limits are re-read for every line, as plotting may change them.
        plt.plot([idx, idx], plt.ylim(), color)


# In[36]:


xticks_ = np.sort(idx_load + idx_con.tolist() + idx_ini.tolist() + idx_clr.tolist() + idx_evt.tolist())

# One label per tick; plt.xticks() needs exactly as many labels as ticks.
xlabels_ = ['$\\longrightarrow$\nLoad Libs',
            '$\\longrightarrow$\nConstr.',
            '$\\longrightarrow$\nInit()',
            '$\\longrightarrow$\nClear()\n#1',
            '$\\longrightarrow$\nMake()',
            '$\\longrightarrow$\nClear()\n#2',
            '$\\longrightarrow$\nMake()',
            '$\\longrightarrow$\nClear()\n#3',
            '$\\longrightarrow$\nMake()']


# In[37]:


fig = plt.figure( figsize=(8,4))

for b in builds:
    if b.id.startswith(('deb', 'dic')):
        continue
    # `dfs` only holds builds whose status file existed when it was loaded.
    if b.id not in dfs:
        continue
    data = dfs[b.id].VmSize
    plt.plot(np.arange(len(data)), data, label=b.name)

plt.ylim(bottom=0, top=1500)

plot_vertical_lines(idx_load, 'C0:')
plot_vertical_lines(idx_con, 'C1:')
plot_vertical_lines(idx_ini, 'C2:')
plot_vertical_lines(idx_clr, 'C3:')
plot_vertical_lines(idx_evt, 'C4:')

plt.xticks(xticks_, xlabels_, ha='left', size=10)

plt.legend()
plt.grid()
plt.gca().set(ylabel='MB')
plt.tight_layout()
# The VmSize overview used to be written to 'memory_calls_overall.svg' and was
# then overwritten by the VmRSS overview below. It now gets its own file; the
# original file name keeps holding the VmRSS figure, as it did in the end.
plt.savefig('graphics/memory_calls_overall_vmsize.svg')


# In[38]:


fig = plt.figure( figsize=(8,4))

for b in builds:
    if b.id.startswith(('deb', 'dic')):
        continue
    # `dfs` only holds builds whose status file existed when it was loaded.
    if b.id not in dfs:
        continue
    data = dfs[b.id].VmRSS
    plt.plot(np.arange(len(data)), data, label=b.name)

plt.ylim(bottom=0, top=1500)

plot_vertical_lines(idx_load, 'C0:')
plot_vertical_lines(idx_con, 'C1:')
plot_vertical_lines(idx_ini, 'C2:')
plot_vertical_lines(idx_clr, 'C3:')
plot_vertical_lines(idx_evt, 'C4:')

plt.xticks(xticks_, xlabels_, ha='left', size=10)

plt.legend()
plt.grid()
plt.gca().set(ylabel='MB')
plt.tight_layout()
plt.savefig('graphics/memory_calls_overall.svg')


# In[39]:


fig = plt.figure(figsize=(8,4))

data32 = dfs['rel32'].VmRSS.values
data64 = dfs['rel64'].VmRSS.values
# Element-wise difference; both tables must have the same number of rows.
delta = data64-data32

plt.plot(np.arange(len(data32)), delta, label='64 - 32')
plt.ylim(bottom=0)
plt.gca().set(ylabel='MB')
plt.legend()
plt.grid(True)
plt.xticks(xticks_, xlabels_, ha='left', size=10)

# Second y axis: first and last difference as a percentage of the maximum.
plt.twinx()
plt.plot([0,len(delta)-1], [delta[0]/np.max(delta)*100, delta[-1]/np.max(delta)*100], '.', label='64 - 32')
plt.gca().tick_params('y', colors='r')
plt.grid(color='r')
plt.ylim(bottom=0)

plt.tight_layout()
plt.savefig('graphics/memory_calls_diff.svg')


# In[40]:


#fig = plt.figure( figsize=(8,4))
#
#for b in builds:
#    data = dfs[b.id].heap
#    plt.plot(np.arange(len(data)), data, label=b.name)
#
#plt.ylim(bottom=0)
#
#plt.legend()
#plt.grid()
#plt.gca().set(xlabel='MemStat Call', ylabel='Mb')
#plt.tight_layout()


# ## Statistical Analysis of ELF files

# In[41]:


#import cxxfilt
from elftools.elf.elffile import ELFFile


# In[42]:


from elftools.construct import Container

# json_normalize is a top-level pandas function since pandas 1.0, where the
# `pandas.io.json` import path was deprecated; fall back to the old location
# only for older pandas versions.
try:
    json_normalize = pd.json_normalize
except AttributeError:
    from pandas.io.json import json_normalize


def todict(dict_with_container):
    """Return a plain-dict copy of a mapping, recursing into Container values.

    Values that are `elftools.construct.Container` instances are replaced by
    the converted content of their `__dict__`; all other values are copied
    as they are.
    """
    return {
        key: todict(value.__dict__) if isinstance(value, Container) else value
        for key, value in dict_with_container.items()
    }


def symbol_to_dict(sym):
    """Describe one ELF symbol as a dict with keys 'name', 'entry' and 'cf'.

    'name' is the demangled symbol name when demangling works, otherwise the
    raw name; 'entry' is the symbol entry converted with `todict`; 'cf' tells
    whether demangling succeeded.

    NOTE: `import cxxfilt` is commented out in this file, so the demangling
    call currently raises NameError and every symbol takes the fallback path
    (raw name, cf=False). The fallback is kept on purpose so that the result
    does not depend on whether cxxfilt is installed.
    """
    entry = todict(sym.entry)
    try:
        demangled = cxxfilt.demangle(sym.name)
    except Exception:
        return dict(name=sym.name, entry=entry, cf=False)
    return dict(name=demangled, entry=entry, cf=True)


def make_sym_df(libfull):
    """Build a table of the symbols in the '.symtab' section of an ELF file.

    Parameters
    ----------
    libfull : str
        Path of the ELF file.

    Returns
    -------
    pandas.DataFrame or None
        One row per symbol with the flattened fields of `symbol_to_dict`
        (nested keys joined with '_') plus a boolean column 'r' that marks
        names matching '_dictLN_|_dict_|ROOTDict|G__'. None is returned when
        the file has no '.symtab' section or cannot be processed as ELF.
        Failing to open the file is not caught.
    """
    with open(libfull, 'rb') as f:
        try:
            symtab = ELFFile(f).get_section_by_name('.symtab')
            if symtab is None:
                # No symbol table in this file (get_section_by_name found nothing).
                return None
            symbols_list = [symbol_to_dict(symbol) for symbol in symtab.iter_symbols()]
            df = pd.DataFrame(json_normalize(symbols_list, sep='_'))
            isrootgen = df.name.str.contains('_dictLN_|_dict_|ROOTDict|G__')
            return df.assign(r=isrootgen)
        except Exception:
            # Best effort: files that cannot be parsed are reported as None
            # and skipped by the callers.
            return None


def calc_stats(df):
    """Summarise a symbol table produced by `make_sym_df`.

    Returns a dict with
      size_st : sum of 'entry_st_size' over all symbols,
      frac_rs : share of that sum belonging to symbols flagged in column 'r',
      frac_rn : share of symbols flagged in column 'r'.
    """
    # A c++filt statistic used to be computed here as
    # count_nonzero(df.cf)/len(df.cf) -- the share of symbols for which
    # demangling succeeded, although it was named "failed" -- and was never
    # returned, so it has been dropped.
    total_st_size = df.entry_st_size.sum()
    root_st_size = df.loc[df.r].entry_st_size.sum()
    frac_root_size = float(root_st_size)/total_st_size
    frac_root_numb = float(np.count_nonzero(df.r))/len(df.r)
    return dict(size_st=total_st_size,
                #frac_cf=frac_failed_cppfilt,
                frac_rs=frac_root_size,
                frac_rn=frac_root_numb )


def make_stats_dict(build, libfile):
    """Return file size and symbol statistics of one library in one build.

    The result is `calc_stats` of the library's symbol table plus the key
    'size' (file size in bytes), or None if no symbol table could be built.
    """
    # os.path.join() works whether or not the library path ends with a separator.
    libfull = os.path.join(tk.get_libpath(build), libfile)
    df = make_sym_df(libfull)
    if df is None:
        return None
    stats = calc_stats(df)
    size = os.path.getsize(libfull)
    return dict(size=size, **stats)


# ### All Libraries Statistics

# In[43]:


libstats = []
libfiles_selected = [lf for lf in libfiles['rel32'] if '.so' in lf]
#libfiles_selected = [lf for lf in libfiles_selected if any(s in lf for s in ['RTS', 'Sti', 'StEvent', 'Db', 'db'])]
libfiles_selected = [lf for lf in libfiles_selected if any(s in lf for s in ['RTS', 'Sti', 'StEvent'])]
libfiles_selected


# In[44]:


for idx, libfile in enumerate(libfiles_selected):
    print(f'{idx:3d} processing {libfile}')
    for b in builds:
        #if 'rel' not in b.id: continue
        if 'deb' in b.id:
            continue
        libstat = make_stats_dict(b, libfile)
        if libstat is None:
            # Give up on this library for the remaining builds as well.
            print('^^^ skipping...')
            break
        libstats.append( dict(name=libfile, bid=b.id, **libstat) )

#import pprint
#pprint.pprint(libstats)


# In[45]:


df = pd.DataFrame(json_normalize(libstats, sep='_'))


# In[46]:


df


# In[47]:


select = df.bid.str.contains('rel') |df.bid.str.contains('dic')
data = df.loc[select]


# In[48]:
# File size of the selected libraries, one bar per build.
fig = plt.figure(figsize=(8,6))
ax = sns.barplot(x='size', y='name', hue='bid', data=data)
ax.semilogx()
ax.grid()
# Show build names instead of build ids in the legend.
for legend_text in ax.legend().get_texts():
    legend_text.set_text(id_to_name(legend_text.get_text()))
plt.savefig('graphics/memory_select_elf_size.svg')


# In[49]:


# Summed symbol size ('size_st') of the selected libraries.
fig = plt.figure(figsize=(8,6))
#ax = sns.barplot(x=data['size_st']/data['size'], y='name', hue='bid', data=data)
ax = sns.barplot(x='size_st', y='name', hue='bid', data=data)
ax.semilogx()
ax.grid()
for legend_text in ax.legend().get_texts():
    legend_text.set_text(id_to_name(legend_text.get_text()))
plt.savefig('graphics/memory_select_elf_size_st.svg')


# In[50]:


# Share of the summed symbol size flagged as ROOT-generated ('frac_rs').
fig = plt.figure(figsize=(8,6))
ax = sns.barplot(y='name', x='frac_rs', hue='bid', data=data)
ax.semilogx()
ax.grid()
for legend_text in ax.legend().get_texts():
    legend_text.set_text(id_to_name(legend_text.get_text()))
plt.savefig('graphics/memory_select_elf_frac_rs.svg')


# ### Detailed View of Select Libraries

# In[51]:


libname = 'libSti.so'
#libname = 'libStEvent.so'


# In[52]:


df = make_sym_df(libpaths['rel32']+libname)


# In[53]:


df


# In[54]:


# NOTE(review): df32, df32t and df64 are not defined anywhere in this file;
# presumably they are symbol tables from make_sym_df() for different builds of
# `libname` -- confirm and define them before running the cells below.
for frame, tag in ((df32, '32'), (df32t, '32t'), (df64, '64')):
    plt.hist(frame.entry_st_size, label=tag, bins=np.logspace(0, 5, 101), histtype='step')

plt.semilogx()
#plt.semilogy()
plt.grid()
plt.legend()


# In[ ]:


# Same distributions as above, drawn cumulatively.
bins = np.logspace(0, 5, 101)

for frame, tag in ((df32, '32'), (df32t, '32t'), (df64, '64')):
    plt.hist(frame.entry_st_size, label=tag, bins=bins, histtype='step', cumulative=True)

plt.semilogx()
plt.grid()
plt.legend(loc='lower center')


# In[ ]:


def select_symbols(df):
    """Return boolean masks classifying the symbols of a symbol table.

    The masks are returned in the order (function, object, weak, global,
    local). The first two compare column 'entry_st_info_type' with 'STT_FUNC'
    and 'STT_OBJECT'; the other three compare column 'entry_st_info_bind'
    with 'STB_WEAK', 'STB_GLOBAL' and 'STB_LOCAL'.
    """
    sym_type = df.entry_st_info_type
    sym_bind = df.entry_st_info_bind
    return (
        sym_type == 'STT_FUNC',
        sym_type == 'STT_OBJECT',
        sym_bind == 'STB_WEAK',
        sym_bind == 'STB_GLOBAL',
        sym_bind == 'STB_LOCAL',
    )
# In[ ]:


#df.loc[sel_l]


# In[ ]:


def select_symbol_sizes(df):
    """Split the symbol sizes of a symbol table by type, binding and ROOT flag.

    Parameters
    ----------
    df : pandas.DataFrame
        Symbol table with the columns used by `select_symbols`, the size
        column 'entry_st_size' and the boolean column 'r'.

    Returns
    -------
    tuple of 14 pandas.Series
        (f, o, fl, fg, fw, flr, fgr, fwr, ol, og, ow, olr, ogr, owr):
        sizes of all functions and all objects, followed for functions and
        then for objects by the local / global / weak subsets and the same
        three subsets restricted to rows flagged in column 'r'.
    """
    sel_fnc, sel_obj, sel_w, sel_g, sel_l = select_symbols(df)
    sizes = df.entry_st_size
    bindings = (sel_l, sel_g, sel_w)

    result = [sizes[sel_fnc], sizes[sel_obj]]
    for kind in (sel_fnc, sel_obj):
        result.extend(sizes[kind & bind] for bind in bindings)
        result.extend(sizes[kind & bind & df.r] for bind in bindings)
    return tuple(result)


# In[ ]:


# NOTE(review): df32, df32t and df64 are not defined anywhere in this file;
# presumably they are symbol tables from make_sym_df() for different builds --
# confirm and define them before running this cell.
f32, o32, fl32, fg32, fw32, fl32r, fg32r, fw32r, ol32, og32, ow32, ol32r, og32r, ow32r = select_symbol_sizes(df32)
f32t, o32t, fl32t, fg32t, fw32t, fl32rt, fg32rt, fw32rt, ol32t, og32t, ow32t, ol32rt, og32rt, ow32rt = select_symbol_sizes(df32t)
f64, o64, fl64, fg64, fw64, fl64r, fg64r, fw64r, ol64, og64, ow64, ol64r, og64r, ow64r = select_symbol_sizes(df64)


# In[ ]:


def change_in_size(s32, s64, label32='32', label64='64'):
    """Format the total sizes of two symbol-size series and their relative change.

    Parameters
    ----------
    s32, s64 : pandas.Series
        Symbol sizes in bytes of the reference and of the compared library.
    label32, label64 : str, optional
        Names printed for the two libraries. The defaults reproduce the
        original fixed '32' / '64' wording; pass other names when comparing
        something else, so that the text matches the plotted data.

    Returns
    -------
    str
        Three lines: both totals (sum/1024/1024) and the change of the second
        total relative to the first in percent.
    """
    total32 = s32.sum()
    total64 = s64.sum()
    change = (total64 - total32)/total32*100
    return (f'Total Size {label32}: {total32/1024/1024:.2f}Mb\n'
            f'Total Size {label64}: {total64/1024/1024:.2f}Mb\n'
            f'Change in size: {change:+.1f}%')


# In[ ]:


h_kwargs = dict(bins=np.logspace(0, 4, 101), cumulative=-1, histtype='step')
#lib_labels = ['64', '32']
lib_labels = ['32t', '32']

fig, axs = plt.subplots(1, 2, figsize=(8,4), sharey='row')#, gridspec_kw=dict(wspace=0))

ax = axs[0]
#ax.hist([f64, f32], label=lib_labels, **h_kwargs)
ax.hist([f32t, f32], label=lib_labels, **h_kwargs)
ax.set_title('Functions')
ax.set_ylabel('Number of Symbols')
#ax.text(0.45, 0.6, change_in_size(f32, f64), transform=ax.transAxes)
ax.text(0.45, 0.6, change_in_size(f32, f32t, '32', '32t'), transform=ax.transAxes)

ax = axs[1]
#ax.hist([o64, o32], label=lib_labels, **h_kwargs)
ax.hist([o32t, o32], label=lib_labels, **h_kwargs)
ax.set_title('Objects')
ax.set_ylabel('Number of Symbols')
#ax.text(0.45, 0.6, change_in_size(o32, o64), transform=ax.transAxes)
ax.text(0.45, 0.6, change_in_size(o32, o32t, '32', '32t'), transform=ax.transAxes)

for ax in np.ravel(axs):
    ax.grid()
    ax.legend()
    ax.semilogx()
    ax.set_xlabel('Symbol Size in Bytes')

plt.tight_layout()

#f_lgw = {'Local': [fl64, fl32], 'Global': [fg64, fg32], 'Weak': [fw64, fw32]}
#o_lgw = {'Local': [ol64, ol32], 'Global': [og64, og32], 'Weak': [ow64, ow32]}
f_lgw = {'Local': [fl32t, fl32], 'Global': [fg32t, fg32], 'Weak': [fw32t, fw32]}
o_lgw = {'Local': [ol32t, ol32], 'Global': [og32t, og32], 'Weak': [ow32t, ow32]}

#f_lgwr = {'Local': [fl64r, fl32r], 'Global': [fg64r, fg32r], 'Weak': [fw64r, fw32r]}
#o_lgwr = {'Local': [ol64r, ol32r], 'Global': [og64r, og32r], 'Weak': [ow64r, ow32r]}
f_lgwr = {'Local': [fl32rt, fl32r], 'Global': [fg32rt, fg32r], 'Weak': [fw32rt, fw32r]}
o_lgwr = {'Local': [ol32rt, ol32r], 'Global': [og32rt, og32r], 'Weak': [ow32rt, ow32r]}

fig, axs = plt.subplots(4, 3, figsize=(8,12), sharey=True)#'row')#, gridspec_kw=dict(wspace=0))

# One row of panels per symbol category, one column per binding.
panel_rows = [
    ('Functions', f_lgw),
    ('Functions, ROOT Related', f_lgwr),
    ('Objects', o_lgw),
    ('Objects, ROOT Related', o_lgwr),
]

for row, (row_label, groups) in enumerate(panel_rows):
    for indx, (symlabel, symbols) in enumerate(groups.items()):
        ax = axs[row, indx]
        ax.hist(symbols, label=lib_labels, **h_kwargs)
        ax.set_title(symlabel)
        # Only the leftmost panel of a row carries the row label.
        if indx == 0:
            ax.set_ylabel(row_label)

for ax in np.ravel(axs):
    ax.grid()
    ax.legend()
    ax.semilogx()
    ax.set_xlabel('Symbol Size in Bytes')

plt.tight_layout()


# ## Appendix
#
# ### Various Info about ELF Files

# In[ ]:


df[['cf', 'r']].agg(['value_counts'])


# In[ ]:


df[['entry_st_info_type', 'entry_st_info_bind']].agg(['value_counts'])


# In[ ]:


#section_names = [ (s.name, s['sh_type']) for s in elf32.iter_sections() if s.name and 'SYM' in s['sh_type']]
#section_names


# In[ ]:


#sections = [s for s in elf32.iter_sections() if s.name]
#sections


# In[ ]:


#load_segments = [s for s in elf32.iter_segments() if s.header.p_type == 'PT_LOAD']
#load_segments


# In[ ]:


#seg_sec = [(seg_indx, sec.name, sec['sh_type']) for seg_indx, seg in enumerate(load_segments) for sec in sections if seg.section_in_segment(sec)]
#seg_sec


# In[ ]:


#symbols = [sym for sym in elf32.get_section_by_name('.symtab').iter_symbols()]
#len(symbols)
##symbols