import sys sys.version_info !wget http://aliman.s3.amazonaws.com/petl/petl-case-study-1-files.zip !unzip -o petl-case-study-1-files.zip import petl as etl etl.__version__ a = etl.fromtsv('snpdata.csv') b = etl.fromtsv('popdata.csv') a.header() b.header() b_renamed = b.rename({'Chromosome': 'Chr', 'Coordinates': 'Pos', 'Ref. Allele': 'Ref', 'Non-Ref. Allele': 'Nref', 'Derived Allele': 'Der', 'Mutation type': 'Mut', 'Gene': 'GeneId', 'Gene Aliases': 'GeneAlias', 'Gene Description': 'GeneDescr'}) b_renamed.header() common_fields = ['Chr', 'Pos', 'Ref', 'Nref', 'Der', 'Mut', 'GeneId', 'GeneAlias', 'GeneDescr'] a_common = a.cut(common_fields) b_common = b_renamed.cut(common_fields) a_common b_common b_common.display(vrepr=repr) a_common.valuecounts('Mut') b_common.valuecounts('Mut') a_conv = a_common.convert('Pos', int) b_conv = ( b_common .convert('Pos', int) .convert('Mut', {'SYN': 'S', 'NON': 'N'}) ) highlight = 'background-color: yellow' a_conv.display(caption='a', vrepr=repr, td_styles={'Pos': highlight}) b_conv.display(caption='b', vrepr=repr, td_styles={'Pos': highlight, 'Mut': highlight}) a_conv.nrows() b_conv.nrows() a_locs = a_conv.cut('Chr', 'Pos') b_locs = b_conv.cut('Chr', 'Pos') locs_only_in_a = a_locs.complement(b_locs) locs_only_in_a.nrows() locs_only_in_a.displayall(caption='a only') locs_only_in_b = b_locs.complement(a_locs) locs_only_in_b.nrows() locs_only_in_a.tocsv('missing_locations.csv') locs_only_in_a = a_conv.antijoin(b_conv, key=('Chr', 'Pos')) locs_only_in_a.nrows() ab_merge = etl.merge(a_conv, b_conv, key=('Chr', 'Pos')) ab_merge.display(caption='ab_merge', td_styles=lambda v: highlight if isinstance(v, etl.Conflict) else '') ab = etl.cat(a_conv.addfield('source', 'a', index=0), b_conv.addfield('source', 'b', index=0)) ab_conflicts = ab.conflicts(key=('Chr', 'Pos'), exclude='source') ab_conflicts.display(10) ab_conflicts_mut = ab.conflicts(key=('Chr', 'Pos'), include='Mut') ab_conflicts_mut.display(10, caption='Mut conflicts', td_styles={'Mut': highlight}) ab_conflicts_mut.nrows()