#!/usr/bin/env python # coding: utf-8 # # Snakemake Overview # # A Snakemake workflow is defined in terms of rules that are written in a file named Snakefile (similar to a Makefile with GNU Make). Rules consist of a name, input file(s), output file(s), and a shell command to generate the output from the input. Dependencies between rules are handled implicitly, by matching filenames of input files against output files. # # ## A first workflow # # To illustrate the use of Snakemake, we will test if a book follows the Zipf law: an empirical law which states that given a large sample of words, the frequency of any word is inversely proportional to its rank in the frequency table. # # The first rule (named `count_words`) will take a book stored in text file as input and generate a list of words sorted by the number of occurrences in the book and a second rule (named `fit_zipf`) will try to fit the data from previous step to check if it follows the Zipf law. The final step (rule named `plot_zipf`) will generate a graph from the generated data. # # These 3 rules are in the file named `Snakefile` that will be used to provide the workflow description to Snakemake. # #

# Note that the first rule defined in the file is the plot_zipf rule. By default Snakemake executes the first rule in the Snakefile, thus, the rule that produce the final result should be the first rule. #

# In[ ]: get_ipython().run_cell_magic('writefile', 'Snakefile', "\nrule plot_zipf: \n input: 'outputs/isles.dat', 'outputs/isles.fit.dat'\n output: 'outputs/isles.png'\n shell: 'python scripts/plotzipf.py outputs/isles.dat outputs/isles.fit.dat outputs/isles.png' \n\nrule fit_zipf:\n input: 'outputs/isles.dat'\n output: 'outputs/isles.fit.dat'\n shell: 'python scripts/fitzipf.py outputs/isles.dat outputs/isles.fit.dat'\n\nrule count_words: \n input: 'books/isles.txt' \n output: 'outputs/isles.dat' \n shell: 'python scripts/wordcount.py books/isles.txt outputs/isles.dat'\n") #

# The input and output files are defined relative to the current working directory #

# # We can check the validy of our workflow by performing a dry run. This can be done by passing the `-n` option when invoking Snakemake # In[ ]: get_ipython().system('snakemake --forceall -n') # We use the `--forceall` option to make sure that the whole workflow is executed regardless of the eventual presence of previously generated files. # # Another thing we can do is visualize the directed acyclic graph (DAG) our workflow of our workflow with the `--dag` option and pipe the output to the Graphviz `dot` command to get a PNG from the DAG. # In[ ]: get_ipython().system('snakemake --forceall --dag | dot -Tpng > dag.png') from IPython.display import Image Image('dag.png') # Now, let's run our workflow: # In[ ]: get_ipython().system('snakemake --forceall --cores 1') Image('outputs/isles.png') # If the output of a rule is already present on the filesystem then Snakemake will not run the job that generates it. In general, a job (rule) is executed when # # - output file is the target and does not exist # - output file needed by another executed job and does not exist # - input file newer than output file # - input file will be updated by other job # - execution is enforced (`--forceall`) # # For example, if we remove the `isles.png` file that is generated by the `plot_zipf` rule but keep the output of the other rules, then only the `plot_zipf` rule will be executed. # In[ ]: get_ipython().run_line_magic('rm', 'outputs/isles.png') get_ipython().system('snakemake --cores 1') # We can run part of the workflow by passing the target (output) we are interested in as the last argument when invoking Snakemake. Snakemake will only run the jobs that produces the dependencies of the job that produce the output you pass as a command-line argument. # In[ ]: get_ipython().system('snakemake --cores 1 --forceall outputs/isles.fit.dat') # ## Make our workflow more flexible and readable # # Our current workflow contains a lot of repetition. For example, we explicitly specify path for the input and output directives and repeat these parameters in the shell command. In order to avoid these repetitions we can refer to elements of the rule using `{input}` and `{output}`. In general, all local and global variables in a `Snakefile` can be accessed via their names in the [Python format minilanguage](https://docs.python.org/3/library/string.html#formatspec). # # In the case of lists or tuples, they are evaluated to a space-separated list. For example, if we have the following directives # # ```python # input: 'path/to/file1', 'path/to/file2' # shell: 'mycmd {input}' # ``` # # then `{input}` will be evaluated as `path/to/file1 path/to/file2`. You can also refer the individual element of the list using their indexes # # ```python # input: 'path/to/file1', 'path/to/file2' # shell: 'mycmd {input[0]} {input[1]}' # ``` # # An other option is to give a name to your inputs and access the values using the names # # ```python # input: # file1='path/to/file1', # file2='path/to/file2' # shell: 'mycmd {input.file1} {input.file2}' # ``` # # ### Wildcards # # We still have the name of the book is repeated for every rule both in the input and output directives. To avoid these repetitions and make our workflow more flexible, we can use wildcards. For example, the `count_word` rule can be rewritten as # # ```python # rule count_words: # input: 'books/{book}.txt' # output: 'outputs/{book}.dat' # shell: 'python scripts/wordcount.py {input} {output}' # ``` # # where we have defined the wildcard `{book}`. This rule can be interpreted as: *to generate the file named `output/something.dat` (output) find a file named `books/something.txt` (input) and run `wordcount.py input output`*. With all we have discussed above, we can write our workflow as # In[ ]: get_ipython().run_cell_magic('writefile', 'Snakefile', "\nrule plot_zipf: \n input: \n wordcount='outputs/isles.dat',\n fitting='outputs/isles.fit.dat'\n output: 'outputs/isles.png'\n shell: 'python scripts/plotzipf.py {input.wordcount} {input.fitting} {output}' \n\nrule fit_zipf:\n input: 'outputs/{book}.dat'\n output: 'outputs/{book}.fit.dat'\n shell: 'python scripts/fitzipf.py {input} {output}'\n\nrule count_words: \n input: 'books/{book}.txt' \n output: 'outputs/{book}.dat' \n shell: 'python scripts/wordcount.py {input} {output}'\n") # By using a wildcard, we only need to specify explicitly the name of the book in the `plot_zipf` rule and let Snakemake do the wildcards substitution to determine which rules should be run. # # In our case, Snakemake will determine that in order to produce `outputs/isles.png`, `outputs/isles.dat` and `outputs/isles.fit.dat` are required and that these files may be generated by substituting the `{book}` wildcard with `isles` in the `fit_zipf` and `count_words` rules. # # ### Extending the workflow # # The advantage of using wildcards is that the workflow can quickly be extended to include more books. For that we will use the fact that Snakefiles are Python code: we can include any Python code in our workflow. For example, we can define a list with the book titles: # # ```python # BOOKS = ['isles', 'abyss', 'last', 'sierra'] # ``` # # and then create a new list listing the plots we want to generate and use that list in a new rule that we will name `all`. # # ```python # PLOTS = ['outputs/{book}.png'.format(book=book) for book in BOOKS] # # rule all: # input: PLOTS # ``` # # An alternative is to use the Snakemake `expand` function which produces the same result as Python brackets syntax: # # ```python # PLOTS = expand('outputs/{book}.png', book=BOOKS) # ``` # # Our workflow thus becomes # In[ ]: get_ipython().run_cell_magic('writefile', 'Snakefile', "\nBOOKS = ['isles', 'abyss', 'last', 'sierra']\nPLOTS = expand('outputs/{book}.png', book=BOOKS)\n\nrule all:\n input: PLOTS\n\nrule plot_zipf: \n input: \n wordcount='outputs/{book}.dat',\n fitting='outputs/{book}.fit.dat'\n output: 'outputs/{book}.png'\n shell: 'python scripts/plotzipf.py {input.wordcount} {input.fitting} {output}' \n\nrule fit_zipf:\n input: 'outputs/{book}.dat'\n output: 'outputs/{book}.fit.dat'\n shell: 'python scripts/fitzipf.py {input} {output}'\n\nrule count_words: \n input: 'books/{book}.txt' \n output: 'outputs/{book}.dat' \n shell: 'python scripts/wordcount.py {input} {output}'\n") # In[ ]: get_ipython().system('snakemake --forceall --dag | dot -Tpng > dag.png') Image('dag.png') # In[ ]: get_ipython().system('snakemake --forceall --cores 1 --quiet') # ### Running a Python script # # Snakemake is not limited to running shell command. We can invoke a script using the `script` directive. For example, we can add script that will agreggate the data of all the books: # # ```python # DATS = expand(f'{OUTPUT_DIR}/{{book}}.dat', book=BOOKS) # # rule aggregate: # input: DATS # output: 'outputs/aggregated.dat' # script: 'scripts/aggregate.py' # ``` # # with the `aggregate.py` script # # ```python # data = {} # for input in snakemake.input: # read_input(input, data) # # sorted_data = reversed(sorted(data.items(), key = itemgetter(1))) # # with open(snakemake.output[0],'w') as outfile: # for key, value in sorted_data: # outfile.write('{:s} {:d}\n'.format(key, value)) # ``` # # note that inside the script we have access to a `snakemake` object and make use of it in order to iterate over all the inputs (`snakemake.input`) and determine where we need to write the output (`snakemake.input`). # In[ ]: get_ipython().run_cell_magic('writefile', 'Snakefile', "\nBOOKS = ['isles', 'abyss', 'last', 'sierra']\nPLOTS = expand('outputs/{book}.png', book=BOOKS)\nDATS = expand('outputs/{book}.dat', book=BOOKS)\nRESULTS = multiext('outputs/aggregated', '.dat', '.fit.dat', '.png')\n\nrule all:\n input: PLOTS, RESULTS\n\nrule aggregate:\n input: DATS\n output: 'outputs/aggregated.dat'\n script: 'scripts/aggregate.py'\n\nrule plot_zipf: \n input: \n wordcount='outputs/{book}.dat',\n fitting='outputs/{book}.fit.dat'\n output: 'outputs/{book}.png'\n shell: 'python scripts/plotzipf.py {input.wordcount} {input.fitting} {output}' \n\nrule fit_zipf:\n input: 'outputs/{book}.dat'\n output: 'outputs/{book}.fit.dat'\n shell: 'python scripts/fitzipf.py {input} {output}'\n\nrule count_words: \n input: 'books/{book}.txt' \n output: 'outputs/{book}.dat' \n shell: 'python scripts/wordcount.py {input} {output}'\n") # In[ ]: get_ipython().system('snakemake --forceall --cores 1 --quiet') # In[ ]: get_ipython().system('snakemake --forceall --dag | dot -Tpng > dag.png') Image('dag.png') # Snakemake in not limited to Python script. It can also be used with [R](https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#r-and-r-markdown), [Julia](https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#julia) and [Rust](https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#rust). For all these languages, Snakemake provides an object that can be manipulated to gather information about the workflow. # ### Using a configuration file # # We can use a configuration file In order to set the parameters of our data analysis. For that we will create a YAML file named `config.yml` with the following content: # In[ ]: get_ipython().run_cell_magic('writefile', 'config.yml', "books_dir: 'books'\noutput_dir: 'outputs'\n") # In the Snakefile, we can make use of this configuration file using the `configfile` directive: # # ```python # configfile: 'config.yml' # ``` # # Snakemake will read this configuration file and make the values available in a dictionary named `config`. We can then use this dictionary to extract the configuration for our workflow # # ```python # OUTPUT_DIR = config['output_dir'] # BOOKS_DIR = config['books_dir'] # ``` # Then, we use this variable to define variable that will be used in the input and output directives of our workflow: # # ```python # BOOK_TXT = f'{BOOKS_DIR}/{{book}}.txt' # BOOK_DAT = f'{OUTPUT_DIR}/{{book}}.dat' # BOOK_FIT = f'{OUTPUT_DIR}/{{book}}.fit.dat' # ``` # Note that we use double curly braces (`{{book}}`) so that we obtain the wildcard `{book}` in the output of the formatted string. If the value of the `BOOKS_DIR` is `books`, then the value of `BOOK_TXT` in example above will be `'books/{book}.txt'`. # # We can also ensure that it is no longer necessary to explicitly specify the title of books by extracting the name of the books stored as `txt` files in the `books` directory. To achieve that we can use the Snakemake `glob_wildcards` function. # # ```python # BOOKS = glob_wildcards(f'{BOOKS_DIR}/{{book}}.txt').book # ``` # # The `glob_wildcards` function matches the given pattern against the files present in the filesystem and thereby infers the values for all wildcards in the pattern. This function returns a named tuple. In our case it will list the name of all the files in the `books` directory. # # With what as been discussed above, our workflow is now: # In[ ]: get_ipython().run_cell_magic('writefile', 'Snakefile', "\nconfigfile: 'config.yml'\n\nOUTPUT_DIR = config['output_dir']\nBOOKS_DIR = config['books_dir']\n\nBOOK_TXT = f'{BOOKS_DIR}/{{book}}.txt'\nBOOK_DAT = f'{OUTPUT_DIR}/{{book}}.dat'\nBOOK_FIT = f'{OUTPUT_DIR}/{{book}}.fit.dat'\n\nBOOKS = glob_wildcards(f'{BOOKS_DIR}/{{book}}.txt').book\n\nPLOTS = expand(f'{OUTPUT_DIR}/{{book}}.png', book=BOOKS)\nDATS = expand(f'{OUTPUT_DIR}/{{book}}.dat', book=BOOKS)\nRESULTS = multiext(f'{OUTPUT_DIR}/aggregated', '.dat', '.fit.dat', '.png')\n\nrule all:\n input: PLOTS, RESULTS\n\nrule aggregate:\n input: DATS\n output: 'outputs/aggregated.dat'\n script: 'scripts/aggregate.py'\n\nrule plot_zipf: \n input: \n wordcount=BOOK_DAT,\n fitting=BOOK_FIT\n output: 'outputs/{book}.png'\n shell: 'python scripts/plotzipf.py {input.wordcount} {input.fitting} {output}' \n\nrule fit_zipf:\n input: BOOK_DAT\n output: BOOK_FIT\n shell: 'python scripts/fitzipf.py {input} {output}'\n\nrule count_words: \n input: BOOK_TXT \n output: BOOK_DAT \n shell: 'python scripts/wordcount.py {input} {output}'\n") # In[ ]: get_ipython().system('snakemake --forceall --cores 1 --quiet') # ## More information about Snakemake # # - [Snakemake website](https://snakemake.github.io/) # - [Snakemake documentation](https://snakemake.readthedocs.io/en/stable/)