#!/usr/bin/env python
# coding: utf-8

# ## Guide to using advice posted in Biostars answer https://www.biostars.org/p/9500884/#9501014
#
# For https://www.biostars.org/p/9500884/#9501014

# ## Preparation
#
# #### Preparation: Installing software
#
# Based on the 'Quick installation instructions' in section #2 of 'Userguide.pdf' that comes in the unpacked version of http://eddylab.org/software/hmmer/hmmer-2.3.2.tar.gz.

# In[1]:


# Get the software
get_ipython().system('curl -OL http://eddylab.org/software/hmmer/hmmer-2.3.2.tar.gz')
# unpack it
get_ipython().system('tar xzf hmmer-2.3.2.tar.gz')
# go into unpacked software directory
get_ipython().run_line_magic('cd', 'hmmer-2.3.2')
# Configure for this system, and build the programs
get_ipython().system(' ./configure')
get_ipython().system('make')


# #### Preparation: Setting up the problem
#
# Add the OP's provided example data to be converted, from the bottom of https://www.biostars.org/p/9500884/#9500951 .

# In[2]:


s='''ID   002L_FRG3G              Reviewed;         320 AA.
AC   Q6GZX3;
DT   28-JUN-2011, integrated into UniProtKB/Swiss-Prot.
DT   19-JUL-2004, sequence version 1.
DT   05-JUN-2019, entry version 38.
DE   RecName: Full=Uncharacterized protein 002L;
GN   ORFNames=FV3-002L;
OS   Frog virus 3 (isolate Goorha) (FV-3).
OC   Viruses; Iridoviridae; Alphairidovirinae; Ranavirus.
OX   NCBI_TaxID=654924;
OH   NCBI_TaxID=8295; Ambystoma (mole salamanders).
OH   NCBI_TaxID=30343; Dryophytes versicolor (chameleon treefrog).
OH   NCBI_TaxID=8404; Lithobates pipiens (Northern leopard frog) (Rana pipiens).
OH   NCBI_TaxID=8316; Notophthalmus viridescens (Eastern newt) (Triturus viridescens).
OH   NCBI_TaxID=45438; Rana sylvatica (Wood frog).
RN   [1]
RP   NUCLEOTIDE SEQUENCE [LARGE SCALE GENOMIC DNA].
RX   PubMed=15165820; DOI=10.1016/j.virol.2004.02.019;
RA   Tan W.G., Barkman T.J., Gregory Chinchar V., Essani K.;
RT   "Comparative genomic analyses of frog virus 3, type species of the
RT   genus Ranavirus (family Iridoviridae).";
RL   Virology 323:70-84(2004).
CC   -!- SUBCELLULAR LOCATION: Host membrane {ECO:0000305}; Single-pass
CC       membrane protein {ECO:0000305}.
DR   EMBL; AY548484; AAT09661.1; -; Genomic_DNA.
DR   RefSeq; YP_031580.1; NC_005946.1.
DR   GeneID; 2947774; -.
DR   KEGG; vg:2947774; -.
DR   Proteomes; UP000008770; Genome.
DR   GO; GO:0033644; C:host cell membrane; IEA:UniProtKB-SubCell.
DR   GO; GO:0016021; C:integral component of membrane; IEA:UniProtKB-KW.
DR   InterPro; IPR004251; Pox_virus_G9/A16.
DR   Pfam; PF03003; Pox_G9-A16; 1.
PE   4: Predicted;
KW   Complete proteome; Host membrane; Membrane; Reference proteome;
KW   Transmembrane; Transmembrane helix.
FT   CHAIN         1    320       Uncharacterized protein 002L.
FT                                /FTId=PRO_0000410509.
FT   TRANSMEM    301    318       Helical. {ECO:0000255}.
FT   COMPBIAS    263    295       Pro-rich.
SQ   SEQUENCE   320 AA;  34642 MW;  9E110808B6E328E0 CRC64;
     MSIIGATRLQ NDKSDTYSAG PCYAGGCSAF TPRGTCGKDW DLGEQTCASG FCTSQPLCAR
     IKKTQVCGLR YSSKGKDPLV SAEWDSRGAP YVRCTYDADL IDTQAQVDQF VSMFGESPSL
     AERYCMRGVK NTAGELVSRV SSDADPAGGW CRKWYSAHRG PDQDAALGSF CIKNPGAADC
     KCINRASDPV YQKVKTLHAY PDQCWYVPCA ADVGELKMGT QRDTPTNCPT QVCQIVFNML
     DDGSVTMDDV KNTINCDFSK YVPPPPPPKP TPPTPPTPPT PPTPPTPPTP PTPRPVHNRK
     VMFFVAGAVL VAILISTVRW
//
ID   012L_FRG3G              Reviewed;         297 AA.
AC   Q6GZW3;
DT   28-JUN-2011, integrated into UniProtKB/Swiss-Prot.
DT   19-JUL-2004, sequence version 1.
DT   05-JUN-2019, entry version 29.
DE   RecName: Full=Uncharacterized protein 012L;
GN   ORFNames=FV3-012L;
OS   Frog virus 3 (isolate Goorha) (FV-3).
OC   Viruses; Iridoviridae; Alphairidovirinae; Ranavirus.
OX   NCBI_TaxID=654924;
OH   NCBI_TaxID=8295; Ambystoma (mole salamanders).
OH   NCBI_TaxID=30343; Dryophytes versicolor (chameleon treefrog).
OH   NCBI_TaxID=8404; Lithobates pipiens (Northern leopard frog) (Rana pipiens).
OH   NCBI_TaxID=8316; Notophthalmus viridescens (Eastern newt) (Triturus viridescens).
OH   NCBI_TaxID=45438; Rana sylvatica (Wood frog).
RN   [1]
RP   NUCLEOTIDE SEQUENCE [LARGE SCALE GENOMIC DNA].
RX   PubMed=15165820; DOI=10.1016/j.virol.2004.02.019;
RA   Tan W.G., Barkman T.J., Gregory Chinchar V., Essani K.;
RT   "Comparative genomic analyses of frog virus 3, type species of the
RT   genus Ranavirus (family Iridoviridae).";
RL   Virology 323:70-84(2004).
DR   EMBL; AY548484; AAT09671.1; -; Genomic_DNA.
DR   RefSeq; YP_031590.1; NC_005946.1.
DR   GeneID; 2947784; -.
DR   KEGG; vg:2947784; -.
DR   Proteomes; UP000008770; Genome.
PE   4: Predicted;
KW   Complete proteome; Reference proteome.
FT   CHAIN         1    297       Uncharacterized protein 012L.
FT                                /FTId=PRO_0000410530.
SQ   SEQUENCE   297 AA;  32669 MW;  9B1D9247FF7E5D25 CRC64;
     MCAKLVEMAF GPVNADSPPL TAEEKESAVE KLVGSKPFPA LKKKYHDKVP AQDPKYCLFS
     FVEVLPSCDI KAAGAEEMCS CCIKRRRGQV FGVACVRGTA HTLAKAKQKA DKLVGDYDSV
     HVVQTCHVGR PFPLVSSGMA QETVAPSAME AAEAAMDAKS AEKRKERMRQ KLEMRKREQE
     IKARNRKLLE DPSCDPDAEE ETDLERYATL RVKTTCLLEN AKNASAQIKE YLASMRKSAE
     AVVAMEAADP TLVENYPGLI RDSRAKMGVS KQDTEAFLKM SSFDCLTAAS ELETMGF
//
ID   015R_FRG3G              Reviewed;         322 AA.
AC   Q6GZW0;
DT   28-JUN-2011, integrated into UniProtKB/Swiss-Prot.
DT   19-JUL-2004, sequence version 1.
DT   05-JUN-2019, entry version 40.
DE   RecName: Full=Uncharacterized protein 015R;
GN   ORFNames=FV3-015R;
OS   Frog virus 3 (isolate Goorha) (FV-3).
OC   Viruses; Iridoviridae; Alphairidovirinae; Ranavirus.
OX   NCBI_TaxID=654924;
OH   NCBI_TaxID=8295; Ambystoma (mole salamanders).
OH   NCBI_TaxID=30343; Dryophytes versicolor (chameleon treefrog).
OH   NCBI_TaxID=8404; Lithobates pipiens (Northern leopard frog) (Rana pipiens).
OH   NCBI_TaxID=8316; Notophthalmus viridescens (Eastern newt) (Triturus viridescens).
OH   NCBI_TaxID=45438; Rana sylvatica (Wood frog).
RN   [1]
RP   NUCLEOTIDE SEQUENCE [LARGE SCALE GENOMIC DNA].
RX   PubMed=15165820; DOI=10.1016/j.virol.2004.02.019;
RA   Tan W.G., Barkman T.J., Gregory Chinchar V., Essani K.;
RT   "Comparative genomic analyses of frog virus 3, type species of the
RT   genus Ranavirus (family Iridoviridae).";
RL   Virology 323:70-84(2004).
DR   EMBL; AY548484; AAT09674.1; -; Genomic_DNA.
DR   RefSeq; YP_031593.1; NC_005946.1.
DR   PRIDE; Q6GZW0; -.
DR   GeneID; 2947735; -.
DR   KEGG; vg:2947735; -.
DR   Proteomes; UP000008770; Genome.
DR   InterPro; IPR027417; P-loop_NTPase.
DR   SUPFAM; SSF52540; SSF52540; 1.
PE   4: Predicted;
KW   Complete proteome; Reference proteome.
FT   CHAIN         1    322       Uncharacterized protein 015R.
FT                                /FTId=PRO_0000410504.
SQ   SEQUENCE   322 AA;  36098 MW;  8E5F5B3DA9CDFF8A CRC64;
     MEQVPIKEMR LSDLRPNNKS IDTDLGGTKL VVIGKPGSGK STLIKALLDS KRHIIPCAVV
     ISGSEEANGF YKGVVPDLFI YHQFSPSIID RIHRRQVKAK AEMGSKKSWL LVVIDDCMDN
     AKMFNDKEVR ALFKNGRHWN VLVVIANQYV MDLTPDLRSS VDGVFLFREN NVTYRDKTYA
     NFASVVPKKL YPTVMETVCQ NYRCMFIDNT KATDNWHDSV FWYKAPYSKS AVAPFGARSY
     WKYACSKTGE EMPAVFDNVK ILGDLLLKEL PEAGEALVTY GGKDGPSDNE DGPSDDEDGP
     SDDEEGLSKD GVSEYYQSDL DD
//
ID   023R_IIV3               Reviewed;         106 AA.
AC   Q197D7;
DT   16-JUN-2009, integrated into UniProtKB/Swiss-Prot.
DT   11-JUL-2006, sequence version 1.
DT   18-SEP-2019, entry version 20.
DE   RecName: Full=Uncharacterized protein 023R;
GN   ORFNames=IIV3-023R;
OS   Invertebrate iridescent virus 3 (IIV-3) (Mosquito iridescent virus).
OC   Viruses; Iridoviridae; Betairidovirinae; Chloriridovirus.
OX   NCBI_TaxID=345201;
OH   NCBI_TaxID=7163; Aedes vexans (Inland floodwater mosquito) (Culex vexans).
OH   NCBI_TaxID=42431; Culex territans.
OH   NCBI_TaxID=332058; Culiseta annulata.
OH   NCBI_TaxID=310513; Ochlerotatus sollicitans (eastern saltmarsh mosquito).
OH   NCBI_TaxID=329105; Ochlerotatus taeniorhynchus (Black salt marsh mosquito) (Aedes taeniorhynchus).
OH   NCBI_TaxID=7183; Psorophora ferox.
RN   [1]
RP   NUCLEOTIDE SEQUENCE [LARGE SCALE GENOMIC DNA].
RX   PubMed=16912294; DOI=10.1128/jvi.00464-06;
RA   Delhon G., Tulman E.R., Afonso C.L., Lu Z., Becnel J.J., Moser B.A.,
RA   Kutish G.F., Rock D.L.;
RT   "Genome of invertebrate iridescent virus type 3 (mosquito iridescent
RT   virus).";
RL   J. Virol. 80:8439-8449(2006).
DR   EMBL; DQ643392; ABF82053.1; -; Genomic_DNA.
DR   RefSeq; YP_654595.1; NC_008187.1.
DR   GeneID; 4156230; -.
DR   KEGG; vg:4156230; -.
DR   OrthoDB; 16183at10239; -.
DR   Proteomes; UP000001358; Genome.
PE   4: Predicted;
KW   Complete proteome; Reference proteome.
FT   CHAIN         1    106       Uncharacterized protein 023R.
FT                                /FTId=PRO_0000377945.
SQ   SEQUENCE   106 AA;  12767 MW;  6620465F6FC52A18 CRC64;
     MGSYMLFDSL IKLVENRNPL NHEQKLWLID VINNTLNLEG KEKLYSLLIV HNKQQTKIYD
     PKEPFYDIEK IPVQLQLVWY EFTKMHLKSQ NEDRRRKMSL YAGRSP
//
ID   048L_FRG3G              Reviewed;          83 AA.
AC   Q6GZS8;
DT   28-JUN-2011, integrated into UniProtKB/Swiss-Prot.
DT   19-JUL-2004, sequence version 1.
DT   05-JUN-2019, entry version 27.
DE   RecName: Full=Uncharacterized protein 048L;
GN   ORFNames=FV3-048L;
OS   Frog virus 3 (isolate Goorha) (FV-3).
OC   Viruses; Iridoviridae; Alphairidovirinae; Ranavirus.
OX   NCBI_TaxID=654924;
OH   NCBI_TaxID=8295; Ambystoma (mole salamanders).
OH   NCBI_TaxID=30343; Dryophytes versicolor (chameleon treefrog).
OH   NCBI_TaxID=8404; Lithobates pipiens (Northern leopard frog) (Rana pipiens).
OH   NCBI_TaxID=8316; Notophthalmus viridescens (Eastern newt) (Triturus viridescens).
OH   NCBI_TaxID=45438; Rana sylvatica (Wood frog).
RN   [1]
RP   NUCLEOTIDE SEQUENCE [LARGE SCALE GENOMIC DNA].
RX   PubMed=15165820; DOI=10.1016/j.virol.2004.02.019;
RA   Tan W.G., Barkman T.J., Gregory Chinchar V., Essani K.;
RT   "Comparative genomic analyses of frog virus 3, type species of the
RT   genus Ranavirus (family Iridoviridae).";
RL   Virology 323:70-84(2004).
DR   EMBL; AY548484; AAT09707.1; -; Genomic_DNA.
DR   RefSeq; YP_031626.1; NC_005946.1.
DR   GeneID; 2947827; -.
DR   KEGG; vg:2947827; -.
DR   Proteomes; UP000008770; Genome.
PE   4: Predicted;
KW   Complete proteome; Reference proteome.
FT   CHAIN         1     83       Uncharacterized protein 048L.
FT                                /FTId=PRO_0000410516.
SQ   SEQUENCE   83 AA;  9566 MW;  52A13E9E325273F6 CRC64;
     MTAKTLDPSD YNVRDDSTTG MFTPVDRFVC DPESDRIIVR KIPPEWTIGN SMRFVHFTKE
     FTQTFDPSES PSNIVRHTNG KKK
//'''
# Write the sample records to seq_file.txt in the current directory
# (hmmer-2.3.2 at this point, because of the `%cd` in cell 1).
get_ipython().run_line_magic('store', 's >seq_file.txt')


# In[3]:


get_ipython().system('pwd')


# ----
# ## Do the conversion with `sreformat`
#
# Luckily, in a [follow-up](https://www.biostars.org/p/9500884/#9501048), Mensur Dlakic pointed out that "sreformat is part of HMMer's squid library and it will be in the corresponding directory."

# In[4]:


# go into squid directory
get_ipython().run_line_magic('cd', 'squid')


# ### Display Usage information for `sreformat`

# In[5]:


get_ipython().system('./sreformat -h')


# ### Run conversion

# In[6]:


# copy the sequence file to the directory we are in.
get_ipython().system('cp ../seq_file.txt .')


# In[7]:


get_ipython().system('./sreformat fasta seq_file.txt')


# Send the converted output to a file.

# In[8]:


get_ipython().system('./sreformat fasta seq_file.txt > seq_file.fa')


# Display contents of converted file.

# In[9]:


get_ipython().system('cat seq_file.fa')


# ## Use `sreformat` via Python

# In[10]:


import os  # kept: later cells or interactive use may rely on `os` being imported
import subprocess

# Run sreformat directly (no shell) and send its stdout to the FASTA file.
# `check=True` raises CalledProcessError on a non-zero exit status, so a
# failed conversion is reported here instead of surfacing later as an
# empty output file.
with open("made_by_python_seq_file.fa", "w") as fasta_out:
    subprocess.run(
        ["./sreformat", "fasta", "seq_file.txt"],
        stdout=fasta_out,
        check=True,
    )


# In[11]:


get_ipython().system('cat made_by_python_seq_file.fa')


# -----
# ## Use Python script provided by hugo.avila and GenoMax
#
# Python script provided by hugo.avila and GenoMax, see https://www.biostars.org/p/9500884/#9500990 . (It has been edited in the string here to escape the `\n` symbols.)
# In[12]:


# The script text below is written verbatim to script.py. Backslashes are
# doubled inside this (non-raw) literal so that script.py receives `\n` and the
# raw-string pattern r'\s+' exactly; a bare '\s' would be an invalid escape
# sequence both here and in the generated script.
ss='''import os
import sys
import re

def main(input_fname: str, output_fname: str) -> None:
    with open(output_fname, 'w') as f_out:
        with open(input_fname, 'r') as f_in:
            for record in re.split('//', f_in.read())[:-1]:
                record_id = re.split(r'\\s+', record[record.index('ID'):])[1]
                sequence = ''.join(record[record.index('SQ'):].split('\\n')[1:]).replace(' ','')
                f_out.write(f'>{record_id}\\n{sequence}\\n')

if __name__ == '__main__':
    main(input_fname=sys.argv[1], output_fname=sys.argv[2])'''
get_ipython().run_line_magic('store', 'ss >script.py')


# In[13]:


get_ipython().run_line_magic('run', 'script.py seq_file.txt made_by_purepython_seq_file.fa')


# You'd replace `%run` with your typical call to Python if you are running this on the command line. So perhaps:
#
# ```text
# python script.py seq_file.txt made_by_purepython_seq_file.fa
# ```
#
# Or,
#
# ```text
# python3 script.py seq_file.txt made_by_purepython_seq_file.fa
# ```
#
# Or [recommended for Windows machines](https://twitter.com/treyhunner/status/1471180042399068172),
#
# ```text
# py script.py seq_file.txt made_by_purepython_seq_file.fa
# ```
#
# Let's see if that worked.

# In[14]:


get_ipython().system('cat made_by_purepython_seq_file.fa')


# #### The expanded version of hugo.avila's script
#
# hugo.avila's script from https://www.biostars.org/p/9500884/#9501206 (It has been edited in the string here to escape the \n symbols.)

# In[15]:


# Written verbatim to scriptalt.py; unlike script.py it emits the whole ID line
# as the FASTA header and prints any record it fails to convert.
ss2='''
import sys
import re

def main(input_fname: str, output_fname: str) -> None:
    with open(output_fname, 'w') as f_out:
        with open(input_fname, 'r') as f_in:
            # The input file is separated with '//' so if we split the file by these caracters
            # it is possible to get a list of records that can by looped:
            # "record1//record2//record//" --split('//')--> ["record1", "record2", "record3", ""].
            # As you can see above the last item of the splitted string is a "" (empty string)
            # so we need to ignore it
            # li like this: ["record1", "record2", "record3", ""][:-1] -> ["record1", "record2", "record3"].
            for record in re.split('//', f_in.read())[:-1]:
                # I cant see way the first code only returned the first id, i did run it and it worked for the sample.
                # I think that must be some format error in one of the sequences. This try block is kind of
                # ugly but it wil go to the end of your file, write the output and print the unformated records (if any).
                try:
                    # split a record into a list of lines and get only the one that starts with 'ID'.
                    # I think that maybe u only want the whole line so i did not pull only the id.
                    id_line = list(filter(lambda x: x.startswith('ID'), record.split('\\n')))[0]
                    sequence = ''.join(record[record.index('SQ'):].split('\\n')[1:]).replace(' ','')
                    f_out.write(f'>{id_line}\\n{sequence}\\n')
                except Exception as e:
                    print (f'Unformated record\\n{record}')
                    pass

if __name__ == '__main__':
    main(input_fname=sys.argv[1], output_fname=sys.argv[2])'''
get_ipython().run_line_magic('store', 'ss2 >scriptalt.py')


# In[16]:


get_ipython().run_line_magic('run', 'scriptalt.py seq_file.txt made_by_purepython2_seq_file.fa')


# You'd replace `%run` with your typical call to Python if you are running this on the command line. So perhaps:
#
# ```text
# python scriptalt.py seq_file.txt made_by_purepython2_seq_file.fa
# ```
#
# Or,
#
# ```text
# python3 scriptalt.py seq_file.txt made_by_purepython2_seq_file.fa
# ```
#
# Or [recommended for Windows machines](https://twitter.com/treyhunner/status/1471180042399068172),
#
# ```text
# py scriptalt.py seq_file.txt made_by_purepython2_seq_file.fa
# ```
#
# Let's see if that worked.

# In[17]:


get_ipython().system('cat made_by_purepython2_seq_file.fa')


# ----
#
# Enjoy!
#
# ----
#
# ----