Source code for bipype

#!/usr/bin/python
import os
import argparse
import shutil
from metatranscriptomics_bipype import metatranscriptomics
from refseq_bipype import sample, prepare_taxonomy_stats
from settings_bipype import *


[docs]def parse_arguments(args=None):
    parser = argparse.ArgumentParser(
        fromfile_prefix_chars='@',
        description='bipype stands for BioInformatics-PYthon-PipE',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        epilog='All commands may be presented in a configuration file, '
               'fed to bipype with @ prefix as "bipype @my_conf_file", '
               'my_conf_file should contain all desired commands and their'
               'options (if applicable) one per line')

    general = parser.add_argument_group(
        'general',
        'performance and I/O related options')
    general.add_argument(
        "-t", "--threads",
        help='number of threads to be used',
        type=int,
        default=8)
    general.add_argument(
        "-m", "--mode",
        help='available modes: test, run',
        choices=['test', 'run'],
        type=str,
        default='test')
    general.add_argument(
        "--out_dir", "-o",
        type=str,
        metavar='OUTPUT_DIRECTORY',
        default='in_situ',
        help='Directory for output files, '
             'default value is not usable for metatranscriptomics')
    general.add_argument(
        "--input", "-i",
        nargs='*',
        default=None)
    general.add_argument(
        "--ins_len",
        default='9999',
        help='insert length - be advised - you better use it for single run',
        type=int)
    general.add_argument(
        "-postfix",
        type=str,
        help='alphanumerical postfix of processed file',
        default='')
    general.add_argument(
        "-e",
        help="use existing files",
        action='store_true')

    taxonomy_stats = parser.add_argument_group(
        'taxonomy_stats options',
        'options related to process of preparing taxonomy results')
    taxonomy_stats.add_argument(
        "-ot", "--output_type",
        nargs='*',
        default=['ITS', '16S', 'txt'],
        help='Choice of files searched for an analysis, coded: '
             '16S and ITS on usearches - file .usearch_ITS, .usearch_16S')

    dataclean = parser.add_argument_group(
        'input cleaning',
        'methods of cleaning input from noise')
    dataclean.add_argument(
        "-ic", "--initial_cleaning",
        choices=['usearch', 'fastx'],
        type=str,
        default='',
        help='Choice of initial cleaning method')
    dataclean.add_argument(
        "--cutadapt",
        type=str,
        default='',
        nargs=2,
        help='-cutadapt ADAPTER_FILE search_options, '
             'location of file with adapters to be used by cutadapt '
             '(possible "use_filenames" to determine adapters from hardcode), '
             'and list of usearches to be run on created files - '
             'possible options are 16S, ITS, both. '
             'Please note, that mapping options -16S, -ITS are completely '
             'irrelevant if you use cutadapt. '
             'Other note - this is !!!IMPORTANT!!! to present location of file '
             'with adapters as first option of this argument')

    mappings = parser.add_argument_group(
        'mapping',
        'mappings to be done during the run')
    mappings.add_argument(
        "-ITS",
        help='usearch ITS database',
        dest='to_calculate',
        action='append_const',
        const='ITS')
    mappings.add_argument(
        "-16S",
        help='usearch 16S database',
        dest='to_calculate',
        action='append_const',
        const='16S')
    mappings.add_argument(
        "-refseq",
        help='map samples on refseq: p - plant, f - fungi, b - both',
        nargs='?',
        choices=['f', 'p', 'b'],
        dest='to_calculate',
        action='append',
        const='f',
        default='f')
    mappings.add_argument(
        "-MEGAN",
        help='generate .rma files using MEGAN',
        dest='to_calculate',
        action='append_const',
        const='MEGAN')

    contigs = parser.add_argument_group(
        'contig reconstruction',
        'methods of contig reconstruction to be used during the run')
    contigs.add_argument(
        "-assembler",
        default=None,
        type=str,
        help='choose assembler: MH for Megahit, MV for MetaVelvet',
        choices=['MH', 'MV', None])
    contigs.add_argument(
        "-MV",
        default=None,
        help="MetaVelvet's options:"
             "[initial k-mer size - default = 31,"
             "[final k-mer size, [step - default=2 (odd numbers)]]]",
        nargs='*')
    contigs.add_argument(
        "-reconstruct",
        default=None,
        const="rec_",
        help='reconstruct relating to database, options: database_loc, prefix',
        nargs='?')
    contigs.add_argument(
        "-humann",
        help='mapping rapsearch using humann',
        dest='to_calculate',
        action='append_const',
        const='humann')
    contigs.add_argument(
        "-rapsearch",
        help='RAPsearch on protein database',
        nargs='?',
        choices=['rap_KEGG', 'rap_prot', 'rap_KO'],
        dest='to_calculate',
        action='append',
        const='rap_prot',
        default='rap_prot')

    db_loc = parser.add_argument_group(
        'databases',
        'locations of databases to be used for searches')
    db_loc.add_argument(
        "--db_16S",
        type=str,
        default=PATH_X16S_DB,
        help='location of 16S database used in usearch (bowtie indexed')
    db_loc.add_argument(
        "--db_ITS",
        type=str,
        default=PATH_ITS_DB,
        help='location of ITS database used in usearch (bowtie indexed)')
    db_loc.add_argument(
        "--db_refseq_fungi",
        type=str,
        default=[PREF_PATH_REF_FUNGI],
        nargs='*',
        help='location of refseq database to use for fungi analysis. '
             'Up to two paths allowed. If there are multiple databases with'
             'filenames like <path_which_you_specified><second_part_of_name>, '
             'all will be loaded.')
    db_loc.add_argument(
        "--db_refseq_plant",
        type=str,
        default=[PREF_PATH_REF_PLANT_1, PREF_PATH_REF_PLANT_2],
        nargs='*',
        help='location of refseq database to use for plants analysis. '
             'Up to two paths allowed. If there are multiple databases with'
             'filenames like <path_which_you_specified><second_part_of_name>, '
             'all will be loaded.')
    db_loc.add_argument(
        "--db_NCBI_taxonomy",
        type=str,
        default=PATH_NCBI_TAXA_DB,
        help='location of cPickled NCBI_tax_id, '
             'NCBI tax names and NCBI tax ids')
    db_loc.add_argument(
        "--db_reconstruct",
        type=str,
        default=PATH_RECONSTRUCT_DB,
        help='location of database for reconstruction')
    db_loc.add_argument(
        "--db_taxonomy_16S",
        type=str,
        default=PATH_16S_DATABASE,
        help='location of 16S database used in taxonomy classification '
             '(FASTA with specially formatted headers)')
    db_loc.add_argument(
        "--db_taxonomy_ITS",
        type=str,
        default=PATH_ITS_DATABASE,
        help='location of ITS database used in taxonomy classification '
             '(FASTA with specially formatted headers)')

    metatr = parser.add_argument_group(
        'metatranscriptomics',
        'parameters for metatranscriptomic pipe')
    metatr.add_argument(
        "--metatr_config",
        type=str,
        help='configuration file which is necessary to launch '
             'metatranscriptomic part of bipype'),
    metatr.add_argument(
        "-mot", "--metatr_output_type",
        type=str,
        choices=['old', 'new', 'both'],
        default='both',
        help='determines type of the one of the outputs from '
             'metatranscriptomic part of bipype')
    return parser.parse_args(args)


if __name__ == '__main__':
    opts = parse_arguments()
    if opts.metatr_config:
        metatranscriptomics(opts)
    else:
        if opts.input:
            temp_path = os.getcwd()
            if not os.path.exists(temp_path):
                os.makedirs(temp_path)
            for input_file in opts.input:
                shutil.copy(input_file, os.path.join(temp_path))
        sample(opts)
        prepare_taxonomy_stats(opts)