Download and process the PubMed/NLM journal catalog.
import os
import re
import pandas
# Download PubMed Journals
# J_Medline.txt is the journal list that the code below reads and parses.
url = 'ftp://ftp.ncbi.nih.gov/pubmed/J_Medline.txt'
# IPython shell escape: `!` runs the command in a shell and `{url}` is replaced
# by the Python variable above. wget saves into the `download` directory
# (--directory-prefix) and, with --timestamping, skips the transfer unless the
# remote file is newer than the local copy.
! wget --no-verbose --directory-prefix download --timestamping {url}
2016-01-20 17:30:58 URL: ftp://ftp.ncbi.nih.gov/pubmed/J_Medline.txt [1758] -> "download/.listing" [1]
# Load the downloaded journal list into memory as a single string
path = os.path.join('download', 'J_Medline.txt')
with open(path) as handle:
    # The whole file is read at once because the parser splits on
    # multi-line separators rather than iterating line by line.
    text = handle.read()
# Create a dataframe of PubMed journals
rows = []
# Journal records are separated by lines consisting solely of dashes
pattern = re.compile(r'^-+$', re.MULTILINE)
for stanza in pattern.split(text):
    stanza = stanza.strip()
    if not stanza:
        # Skip empty chunks produced by the split
        continue
    row = {}
    for line in stanza.split('\n'):
        # Each field line has the form "Key: value"; split on the first ': '
        key, separator, value = line.partition(': ')
        if not separator:
            # stanza.strip() above removes the trailing space of an empty
            # final field, leaving a bare "Key:" with no ': ' to split on.
            # Treat that as an empty value instead of crashing.
            if not line.endswith(':'):
                raise ValueError(
                    'Cannot parse journal field line: {!r}'.format(line))
            key, value = line[:-1], ''
        # Store empty fields as None so pandas treats them as missing
        row[key] = value or None
    rows.append(row)
journal_df = pandas.DataFrame(rows)
# NlmId values are parsed as strings, so this ordering is lexicographic
journal_df = journal_df.sort_values(by='NlmId')
# Arrange columns from the most complete to the least complete
null_mask = journal_df.isnull()
# The mean of a boolean mask is the fraction of missing values per column
missing_pct = null_mask.mean().sort_values()
column_order = missing_pct.index
journal_df = journal_df[column_order]
# Bare expression so the notebook displays the per-column missing fractions
missing_pct
JournalTitle 0.000000 JrId 0.000000 NlmId 0.000000 IsoAbbr 0.000346 MedAbbr 0.002869 ISSN (Print) 0.194760 ISSN (Online) 0.626205 dtype: float64
# Write the journal table to disk as tab-separated values
path = 'data/pubmed-journals.tsv'
# Tab delimiter, and no row-index column in the output
tsv_options = dict(sep='\t', index=False)
journal_df.to_csv(path, **tsv_options)