Source code for refseq_bipype

#!/usr/bin/python

from os import getcwd, walk, system, chdir, stat
from os.path import join as pjoin
from os.path import exists as pexists
from string import split, find, join
from itertools import combinations
from datetime import datetime
from copy import deepcopy
import cPickle as pickle
from settings_bipype import *


[docs]def cat_read(mode, fileext, paired_end=True):
    """Returns a dict with paths to sequence files
    from current working directory.

    The dict has following format::

        {
            directory_path1: [
                file1_in_this_directory_path,
                file2_in_this_directory_path,
                file3_in_this_directory_path
            ],
            directory_path2: [
                file4_in_this_directory_path,
                file5_in_this_directory_path,
                file6_in_this_directory_path
            ]
        }

    Paths to files (values of dict) are relative.

    Args:
        mode:       If mode is 'run', the gunzip command will be
                    executed to extract compressed files.
        fileext:    Extension of sequence files, which will be putted into
                    the dict.
        paired_end: If True, only paired-end reads are included
                    in the dict (default True).
    """
    double_ext = ['contigs.fa', 'txt.m8']
    if fileext in double_ext:
        cut_len = -2
    else:
        cut_len = -1
    seq_dict = {}
    if fileext == 'fa':
        fileext_l = ['fa', 'fasta', 'FASTA', 'FA', 'fas', 'fna', 'ffn', 'frn']
    else:
        fileext_l = [fileext]
    for root, folders, files in walk(getcwd()):
        for file_ in files:
            file_ext = '.'.join(split(file_, '.')[cut_len:])
            if file_ext in fileext_l:
                seq_dict = fastq_dict(seq_dict, root, file_)
            elif file_ext == 'gz':
                prop_ext = '.'.join(split(file_, '.')[cut_len-1:-1])
                if prop_ext in fileext_l:
                    to_gunzip = pjoin(root, file_)
                    print 'gunzip %s'%(to_gunzip)
                    if mode == 'run':
                        system('gunzip %s'%(to_gunzip))
                    seq_dict = fastq_dict(seq_dict, root, file_[:-3])
            else:
                pass
    if paired_end:
        seq_dict = paired_end_match(seq_dict)
    else:
        pass
    return seq_dict


[docs]def exist_check(program, names, todo):
    """Checks if a part of program is actually done.

    Function tries to find output files from different parts of program
    and basing on the results, removes corresponding tokens from
    todo list.

    In one special case (program == 'usearch_0') function checks size of
    the file specified by <names> argument in bits. If size == 0: todo=[]
    and PANIC MODE information is printed.

    Args:
        program:
            One of the following strings representing programs
            (de facto functions):

                - 'refseq'            (&)
                - 'usearch'           (@)
                - 'usearch_0'         (@)
                - 'MV'                (^)
                - 'rapsearch'         (@)
                - 'cutadapt'          (^)

        names:
            There are three possibilities:

                - (&) A dict in following format::

                    {file extension : path to file with corresponding extension}

                - (@) A string (one path)
                - (^) A list of paths

            Paths are hypothetical - if one really exists,
            appropriate token is removed from todo list.

        todo:
            List of tokens representing different parts of function specified
            by 'program' argument.

    Please, pay attention to mutual compatibility of arguments.

    Result:
        todo:     Checked list of tokens.

    See also:
        For more information please refer to descriptions of the following
        functions:

            - :func:`refseq_ref_namespace` and :func:`refseq_mapping`
            - :func:`usearch`
            - :func:`MV`
            - :func:`rapsearch`
            - :func:`cutadapt`
    """
    if program == 'refseq':
        if pexists(names['sam']):
            todo.remove('bowtie')
            print '%s found'%(names['sam'])
        if pexists(names['bam']):
            todo.remove('bam_make')
            print '%s found'%(names['bam'])
        if pexists(names['sorted.bam']):
            todo.remove('sort_index')
            print '%s found'%(names['sorted.bam'])
        if pexists(names['idxstats']):
            todo.remove('idxstat')
            print '%s found'%(names['idxstats'])
        if pexists(names['map_count']):
            todo.remove('perl')
            print '%s found'%(names['map_count'])
        if pexists(names['tax_count']):
            todo = []
            print '%s exists, to get intermediate files, omit -e option'%(names['tax_count'])
    if program == 'usearch':
        if pexists(names):
            todo = []
            print '%s found'%(names)
    if program == 'usearch_0':
        if stat(names).st_size == 0:
            todo = []
            print 'PANIC MODE: %s size = 0!'%(names)
    if program == 'MV':
        if pexists(names[0]) and pexists(names[1]):
            todo = []
            print '%s and %s found'%(names[0], names[1])
        elif pexists(names[0]):
            todo = []
            print '%s exists, to get intermediate files, omit -e option'%(names[0])
        else:
            if pexists(names[2]) and pexists(names[3]) and pexists(names[4]):
                todo.remove('velveth')
                print '%s %s %s found'%(names[2], names[3], names[4])
            if pexists(names[5]) and pexists(names[6]) and pexists(names[7]) and pexists(names[8]):
                todo.remove('velvetg')
                print '%s %s %s %s found'%(names[5], names[6], names[7], names[8])
    if program == 'rapsearch':
        if pexists(names):
            todo = []
            print '%s found, rapsearch not executing'%(names)
    if program == 'cutadapt':
        if pexists(names[0]):
            todo.remove('cutadapt0')
        if pexists(names[1]):
            todo.remove('cutadapt1')
        if pexists(names[2]):
            todo.remove('flash')
        if pexists(names[3]):
            todo.remove('fq2fa')
    return todo


[docs]def tax_id_reader():
    """Returns a dict with {GI:TaxID} from file.

    Keys and values are integers.

    File has following format::

        13  9913
        15  9915
        16  9771
        17  9771

    where first column is a GI, second is a TaxID.

    GLOBLAS:
        PATH_GI_TAX
    """
    print 'reading tax ids'
    gi_tax_dict = {}

    with open(PATH_GI_TAX) as file_:
        counter = 0
        for line in file_:
            gi, tax = split(line)
            gi_tax_dict[int(gi)] = int(tax)
            counter += 1
            if counter % 1000000 == 0:
                print "Added %i tax_ids"%(counter)
    file_.close()
    return gi_tax_dict


[docs]def tax_name_reader():
    """Returns a dict with {TaxID:scientific_name} from file.

    Lines without scientific names are omitted.

    Keys are integers, values are strings.

    Example:

        File::

            2   |       prokaryotes     |       prokaryotes <Bacteria>  |       in-part |
            6   |       Azorhizobium    |               |       scientific name |
            6   |       Azorhizobium Dreyfus et al. 1988        |               |       synonym |
            6   |       Azotirhizobium  |               |       equivalent name |
            7   |       ATCC 43989      |               |       type material   |
            7   |       Azorhizobium caulinodans        |               |       scientific name |

        Output::

            {6:'Azotirhizobium', 7:'Azorhizobium caulinodans'}

    GLOBALS:
        PATH_TAX_NAME
    """
    print 'reading tax names'
    tax_name_dict = {}
    with open(PATH_TAX_NAME) as file_:
        counter = 0
        for line in file_:
            line_l = split(line, '\t|\t')
            if line_l[3] == 'scientific name\t|\n':
                tax_id = line_l[0]
                if len(line_l[2]) == 0:
                    tax_name = line_l[1]
                else:
                    tax_name = line_l[2]
                tax_name_dict[int(tax_id)] = tax_name
            counter += 1
            if counter % 100000 == 0:
                print "Processed %i tax_name lines"%(counter)
    file_.close()
    return tax_name_dict


[docs]def idx_reader(file_path):
    """Reads file in samtools idxstats output format.

    Args:
        file_path:
            path to samtools idxstats output file.

            File is TAB-delimited with each line consisting of reference
            sequence name, sequence length, # mapped reads and # unmapped reads

    Returns:
        a dict with ``{GI: #_mapped_reads}`` (keys and values are integers).
    """
    idx_file_dict = {}
    for line in file(file_path).readlines():
        line_l = split(line)
        reads = int(line_l[2])
        if reads != 0:
            gi = int(split(line_l[0], '|')[1])
            idx_file_dict[gi] = reads
    return idx_file_dict


[docs]def idx_map(mode, file_, tax_name_dict, tax_id_dict, outfile):
    """Parses and writes data from samtools idxstats output file.

    Firstly, using ``idx_reader(file_)``, creates ``{GI:#_mapped_reads}``
    (a dict).

    Secondly, replaces every GI (key) with TaxID if appropriate
    one is available in tax_id_dict.

    Than, replaces every GI /TaxID (key) with scientific name if
    appropriate one is available in tax_name_dict.

    Finally, writes data (sorted by numbers of mapped reads in
    decreasing order) to outfile in key;value format, where:

        - key   is GI/TaxID/scientific name
        - value is number of mapped reads

    Args:
        mode:
            If (mode == 'run'), function do mentioned things.
            Elif (mode == 'test') function prints `file_` and outfile arguments.
            Else function do nothing.

        `file_`:
            Path to samtools idxstats output file.
            For more information refer to :func:`idx_reader` function.

        tax_name_dict:
            {TaxID:scientific_name} dict.
            For more information refer to :func:`tax_name_reader` function.

        tax_id_dict:
            {GI:TaxID} dict.__weakrefoffset__
            For more information refer to :func:`tax_id_reader` function.

        outfile:       Path to output file.
    """
    from operator import itemgetter
    tax_count_dict = {}
    if mode == 'run':
        idx_file_dict = idx_reader(file_)
        for gi in idx_file_dict:
            try:
                tax_id = tax_id_dict[gi]
                if tax_id in tax_count_dict:
                    tax_count_dict[tax_id] += idx_file_dict[gi]
                else:
                    tax_count_dict[tax_id] = idx_file_dict[gi]
            except KeyError:
                    tax_count_dict[gi] = idx_file_dict[gi]
        tmp_dict = deepcopy(tax_count_dict)
        for tax_gi_id in tax_count_dict:
            try:
                tmp_dict[tax_name_dict[tax_gi_id]] = tmp_dict.pop(tax_gi_id)
            except KeyError:
                pass
        sorted_dict = sorted(tmp_dict.iteritems(), key=itemgetter(1), reverse=True)
        tax_write = open(outfile, 'w')
        for tax_pair in sorted_dict:
            tax_write_string = '%s;%i\n'%(tax_pair[0], tax_pair[1])
            tax_write.write(tax_write_string)
        tax_write.close()
    elif mode == 'test':
        print 'IDX_MAP %s %s'%(file_, outfile)
    else:
        pass


[docs]def pair_uni_name(file_pair):
    """Returns one name for given tuple containing pair of filenames.

    Args:

        file_pair: tuple of paired_end read


    Example:

        input::

            ('Amp15_BFk_B_p_CGTACG_L001_R1_001.fastq','Amp15_BFk_B_p_CGTACG_L001_R2_001.fastq')

        output::

            'Amp15_BFk_B_p_CGTACG_L001_001'
    """
    file_body = '.'.join(split(file_pair[0], '.')[:-1])
    if 'R1' in file_body:
        R_loc = find(file_body, 'R1')
        out_name = file_body[:R_loc] + file_body[R_loc+3:]
    else:
        R_loc = find(file_body, 'R2')
        out_name = file_body[:R_loc] + file_body[R_loc+3:]
    return out_name


[docs]def refseq_ref_namespace(directory, seq, postfix, out_dir='in_situ', map_dir='in_situ'):
    """Returns a dict within:
        - keys are following file extensions: fastq, sam, sam2, bam,
          sorted, sorted.bam, idx_stats, tax_count, map_count
        - values are paths to file with corresponding extension.

        Filenames have following format::

            sample_name + '_' + postfix + extension

        where sample_name is basename of seq (if seq is a file) or
        output of pair_uni_name(seq) (if seq is a tuple)

    Args:

        directory: Path to directory, where file with .fastq extension
                   will be written

        seq:       Tuple of paired_end read or sequence file.

        postfix:   String added to the end of file basenames.

        out_dir:   Path to directory, where files with .sam, .sam2,
                   .bam,.sorted, .sorted.bam, .idx_stats and .tax_count
                   extensions will be written.
                   If out_dir is 'in_situ' (default), out_dir=directory

        map_dir:   Path to directory, where file with
                   .map_count extension will be written. If map_dir is
                   'in_situ'(default),map_dir=out_dir.
    """
    if out_dir == 'in_situ':
        out_dir = directory
    ref_namespace = {}
    if type(seq) == tuple:
        sample_name = pair_uni_name(seq)
        ref_namespace['fastq'] = (pjoin(directory, seq[0]), pjoin(directory, seq[1]))
    else:
        ref_namespace['fastq'] = pjoin(directory, seq)
        sample_name = split(seq, '.')[0]
    if postfix != '':
        sample_name = sample_name + '_' + postfix
    else:
        pass
    ref_namespace['sam'] = sample_name + '.sam'
    ref_namespace['sam2'] = sample_name + '_2' + '.sam'
    ref_namespace['bam'] = sample_name + '.bam'
    ref_namespace['sorted'] = sample_name + '.sorted'
    ref_namespace['sorted.bam'] = sample_name + '.sorted.bam'
    ref_namespace['idxstats'] = sample_name + '.idxstats'
    ref_namespace['tax_count'] = sample_name + '.tax_count'
    for element in ref_namespace:
        if element != 'fastq':
            ref_namespace[element] = pjoin(out_dir, ref_namespace[element])
    if map_dir == 'in_situ':
        ref_namespace['map_count'] = pjoin(out_dir, sample_name + '.map_count')
    else:
        ref_namespace['map_count'] = pjoin(map_dir, sample_name + '.map_count')
    return ref_namespace


[docs]def fastq_dict(seq_dict, root, file_):  # Why not defaultdict from collections?
    """Adds `file_` to list available under `seq_dict[root]`.

    If key `root` is not present, it will be created.
    """
    if root in seq_dict:
        seq_dict[root].append(file_)
    else:
        seq_dict[root] = [file_]
    return seq_dict


[docs]def paired_end_match(seq_dict):
    """Returns a dict with paths to paired-end reads only.
    Files will be matched as pair if their names differs only by parts:

        - R1 and R2, if the parts of filename are separated by underscores ``_``
        - 1 and 2, if the parts of filename are separated by dots ``.``

    Args:
        seq_dict: A dict determining locations of paired-end sequences.

            Example::

                {directory_path:file_path}


    Returns: a dict

        The returned dict has following format::

            {
                directory_path1: [
                    (paired-end_read1_R1_path, paired-end_read1_R2_path),
                    (paired-end_read2_R1_path, paired-end_read2_R2_path),
                    (paired-end_read3_R1_path, paired-end_read3_R2_path)
                ],
                directory_path2: [
                    (paired-end_read4_R1_path, paired-end_read4_R2_path)
                ]
            }

    Paths to files (both in input and output dict) are relative.
    """
    pe_dict = {}
    for directory in seq_dict:
        for seq1, seq2 in combinations(seq_dict[directory], 2):
            # for format with underscores
            seq1_u = set(split(seq1, '_'))
            seq2_u = set(split(seq2, '_'))
            # for format with dots, like output of metAMOS preprocessing
            seq1_d = set(split(seq1, '.'))
            seq2_d = set(split(seq2, '.'))
            if seq1_u ^ seq2_u == {'R1', 'R2'} or seq1_d ^ seq2_d == {'1', '2'}:
                if directory in pe_dict:
                    pe_dict[directory].append((seq1, seq2))
                else:
                    pe_dict[directory] = [(seq1, seq2)]
    return pe_dict


[docs]def bowtie2_run(mode, proc, ref, out, inp1, inp2=False):
    """If `mode` is 'run', launches Bowtie 2.

    Args:
        mode: should bowtie be runned?

        proc:
            [-p in Bowtie 2]

            Number of alignment threads to launch.

        ref:
            [-x in Bowtie 2]

            The basename of the index for the reference genome.

        out:
            [-S in Bowtie 2]

            File to write SAM alignments to.

        inp1: [-1 in Bowtie 2]

            Comma-separated list of files containing mate 1s
            (filename usually includes _1), for example::

                -1 flyA_1.fq,flyB_1.fq

        inp2: [-2 in Bowtie 2]

            Comma-separated list of files containing mate 2s
            (filename usually includes _2), for example::

                -2 flyA_2.fq,flyB_2.fq

            If inp2 is False: -2 argument will not be passed to Bowtie 2.
    """
    if inp2:
        print 'bowtie2 -p %i -x %s -1 %s -2 %s -S %s'%(proc, ref, inp1, inp2, out)
        if mode == 'run':
            system(PATH_BOWTIE2+' -p %i -x %s -1 %s -2 %s -S %s'%(proc, ref, inp1, inp2, out))
    else:
        print 'bowtie2 -p %i -x %s -f -U %s -S %s'%(proc, ref, inp1, out)
        if mode == 'run':
            system(PATH_BOWTIE2+' -p %i -x %s -f -U %s -S %s'%(proc, ref, inp1, out))


[docs]def sam_merge(sam1, sam2):
    """Merges two files in SAM format into one.

    Args:
        sam1: SAM file
        sam2: SAM file

    Returns:
       sam1 file in SAM format which contains now lines from both files.
       Function also removes sam2 file.
    """
    f1 = open(sam1, 'r')
    f2 = open(sam2, 'r')

    f1c = f1.readlines()
    f2c = f2.readlines()

    f1.close()
    f2.close()

    f1c_idx = 0
    for line in f1c:
        line = line.rstrip()
        if line[0] == '@':
            f1c_idx += 1

    f2c_idx = 0
    for line in f2c:
        line = line.rstrip()
        if line[0] == '@':
            f2c_idx += 1

    common = f1c[0:f1c_idx-1] + f2c[1:f2c_idx-1] + [f1c[f1c_idx-1]]
    cont = f1c[f1c_idx:]+ f2c[f2c_idx:]

    f3 = open(sam1, 'w')
    f3.write(join(common, ''),)
    f3.write(join(cont, ''),)
    f3.close()
    remove_sam2 = 'rm %s' % (sam2)
    system(remove_sam2)


[docs]def bam_make(mode, sam, bam):   # multi
    """If `mode` is 'run' and @SQ lines are present in the header,
    the function converts SAM to BAM

    Args:
        mode: string type
        sam: SAM file
        bam: BAM file
    """

    print 'samtools view -bS %s > %s' % (sam, bam)
    if mode == 'run':
        system('samtools view -bS %s > %s' % (sam, bam))


[docs]def bam_sorting(mode, bam, sorted_name):    # multi
    """If `mode` is 'run' the function reads bam (file in BAM format),
    sort it by aligned read position and write it out to BAM file
    whose name is: `sorted_name`.

    Args:
        mode: string type
        bam: BAM file
        sorted_name: name of output file
    """
    print 'samtools sort %s %s' % (bam, sorted_name)
    if mode == 'run':
        system('samtools sort %s %s' % (bam, sorted_name))


[docs]def bam_indexing(mode, sorted_bam):     # multi
    """If mode is 'run' creates an index file
    `sorted_bam`.bam.bai for the `sorted_bam`.bam file.

    Args:
        mode: string type
        sorted_bam: BAM file
    """
    print 'samtools index %s' % (sorted_bam)
    if mode == 'run':
        system('samtools index %s' % (sorted_bam))


[docs]def bam_idxstating(mode, sorted_bam, idxstats):     # multi
    """Launch samtools idxstats.

    Args:
        mode:
            if `mode` is 'run', launch the program.
        sorted_bam:
            path to file in BAM format, which is the input to samtools.
        idxstats:
            path to file, where the output from samtools will be written.
    """
    print 'samtools idxstats %s > %s' % (sorted_bam, idxstats)
    if mode == 'run':
        system ('samtools idxstats %s > %s' % (sorted_bam, idxstats))


[docs]def idxstat_perling(mode, idxstats, map_count):     # multi
    """Counts and writes to `map_count` both:
       - sum of all (numbers of mapped reads) in `idxstats`,
       - sum of all (numbers of unmapped reads) in `idxstats`.

    Function launches short "one-liner" in perl.

    Args:
        mode:
            If mode is 'run' the perl "one-liner" will be launched.

        idxstats:
            Path to samtools idxstats output file.
            Please refer to :func:`idx_reader` for more information.

        map_count:
            Path to file, where differences will be written.

    Output:
        Output has following format::

            #mapped - #unmapped

        Example::

            123 - 456

    TODO:
        This should be rewritten to python (why to use perl here?)
    """
    perl_command = 'perl -e \'while(<>){chomp;@a=split "\t", $_; $b+=$a[2]; $c+=$a[3];} print "$b - $c\n";\' %s > %s' % (
        idxstats, map_count
        )
    print perl_command
    if mode == 'run':
        system(perl_command)


[docs]def refseq_mapping(mode, e, directory, pair, postfix, refseq, tax_name_dict, tax_id_dict, threads, map_dir, refseq_2=False):
    """Aligns reads to reference sequence(s) using Bowtie 2,
    than parses and writes data to files.

        - Firstly, Bowtie 2 align reads to reference sequence (or sequences,
          if refseq_2 is not False and merge output SAM files).
        - then, BAM files are made, sorted and indexed,
        - the function launches 'samtools idxstats',
        - in the next step, perl 'one-liner' counts and writes to file
          sums of mapped reads and unmapped reads,
        - finally, :func:`idx_map` function is called:
          it parses data from samtools idxstats output file and writes data to
          `outfile` in ``key;value`` format, where:

                - key     is    GI/TaxID/scientific name
                - value   is    number of mapped reads.

    Args:
        mode:
            If `mode` is 'run', the function operates on data and prints
            information about it. Otherwise, prints information,
            without operating on data (it is a kind of test).

        e:
            If true, checks if a part of workflow is actually done and doesn't
            duplicate this jobs.

            For more information, please refer to ``e`` argument in
            :func:`exist_check` function.

        directory:
            Path to directory, where files will be written.

        pair:
            Tuple of paths to file (paired-end reads) OR
            a string with path to a sequence file.

        postfix:
            String added to the end of files basenames.

            Argument for :func:`refseq_ref_namespace`.

        refseq:

            The basename of the index for the reference genome.

            Argument for :func:`bowtie2_run`.

        tax_name_dict:

            A dict: ``{TaxID:scientific_name}``

            For more information refer to :func:`tax_name_reader`.

            Argument for :func:`idx_map`.

        tax_id_dict:

            A dict: ``{GI:TaxID}``

            For more information refer to :func:`tax_id_reader`.

            Argument for :func:`idx_map`.

        threads:
            Number of threads for Bowtie 2 calculations.

        map_dir:
            Directory where sums of mapped and unmapped reads will be written.
            For more information refer to :func:`idxstat_perling`.

        refseq_2:

            The basename of the index for the reference genome.

            Argument for :func:`bowtie2_run`.

            If selected, launches Bowtie 2 on it, then merges output with
            output from Bowtie 2 launched on 'refseq'.

            If refseq_2 == False, Bowtie 2 is working only on 'refseq' argument.

    See Also:

        For more information take a look at following functions:

        :func:`refseq_ref_namespace`
        :func:`exist_check`
        :func:`bowtie2_run`
        :func:`sam_merge`
        :func:`bam_make`
        :func:`bam_sorting`
        :func:`bam_indexing`
        :func:`bam_idxstating`
        :func:`idxstat_perling`
        :func:`idx_map`
    """
    todo = ['bowtie', 'bam_make', 'sort_index', 'idxstat', 'perl', 'idx_map']
    ref_namespace = refseq_ref_namespace(directory, pair, postfix, 'in_situ', map_dir)
    if e:
        todo = exist_check('refseq', ref_namespace, todo)
    if 'bowtie' in todo:
        if type(pair) == tuple:
            print "\nAnalysis of %s %s vs %s\n"%(pair[0], pair[1], refseq)
            bowtie2_run(mode, threads, refseq, ref_namespace['sam'], ref_namespace['fastq'][0],
                    ref_namespace['fastq'][1])
            if refseq_2:
                print "\nAnalysis of %s %s vs %s\n"%(pair[0], pair[1], refseq_2)
                bowtie2_run(
                    mode, threads, refseq_2, ref_namespace['sam2'], ref_namespace['fastq'][0],
                    ref_namespace['fastq'][1]
                    )
        else:
            print "\nAnalysis of %s vs %s\n"%(pair, refseq)
            bowtie2_run(mode, threads, refseq, ref_namespace['sam'], ref_namespace['fastq'])
            if refseq_2:
                print "\nAnalysis of %s %s vs %s\n"%(pair[0], pair[1], refseq_2)
                bowtie2_run(mode, threads, refseq_2, ref_namespace['sam2'], ref_namespace['fastq'])
    if 'bam_make' in todo:
        if refseq_2:
            print 'Merging %s and %s'%(ref_namespace['sam'], ref_namespace['sam2'])
            if mode == 'run':
                sam_merge(ref_namespace['sam'], ref_namespace['sam2'])
        bam_make(mode, ref_namespace['sam'], ref_namespace['bam'])
    if 'sort_index' in todo:
        bam_sorting(mode, ref_namespace['bam'], ref_namespace['sorted'])
    if 'idxstat' in todo:
        bam_indexing(mode, ref_namespace['sorted.bam'])
        bam_idxstating(mode, ref_namespace['sorted.bam'], ref_namespace['idxstats'])
    if 'perl' in todo:
        idxstat_perling(mode, ref_namespace['idxstats'], ref_namespace['map_count'])
    if 'idx_map' in todo:
        idx_map(mode, ref_namespace['idxstats'], tax_name_dict, tax_id_dict, ref_namespace['tax_count'])


[docs]def ins_len_read(pair, cat):
    """Return 200 if the read length is less then 200 or 500 in otherwise.

    Args:
        pair: tuple of paired_end read
        cat: name of folder with the sample files

    Example:
        input::

            ('Amp15_BFk_B_p_CGTACG_L001_R1_001.fastq','Amp15_BFk_B_p_CGTACG_L001_R2_001.fastq'), 'catalog_name'

        output::

            500
    """

    sample_name = pair[0]
    zeros = find(sample_name, '00')     # WARNING HARDCODE
    try:
        ins_len = int(sample_name[zeros - 1:zeros + 2])
    # CHANGED
    except ValueError:
        aqq = len(open(pjoin(cat, sample_name), 'r').readlines()[1].rstrip())
        if aqq > 200:
            ins_len = 500
        else:
            ins_len = 200
    return ins_len


[docs]def gzip_MV(MV_dir):
    """Return archive with catalogs Sequences, Roadmaps, PreGraph, Graph2,
    LastGraph and files with statistics.

    Args:
        MV_dir: folder with Velvet results

    """
    gzip_list = [
        'Sequences', 'Roadmaps', 'PreGraph', 'Graph2', 'contigs.fa',
        'stats.txt', 'LastGraph', 'meta-velvetg.Graph2-stats.txt',
        'meta-velvetg.LastGraph', 'meta-velvetg.LastGraph-stats.txt',
        'meta-velvetg.split-stats.txt'
    ]
    to_gzip = ''
    for element in gzip_list:
        to_gzip += ' %s' % (pjoin(MV_dir, element))
        to_gzip_comm = 'tar --remove-files -czf %s %s' % (pjoin(MV_dir, 'intermediates.tgz'), to_gzip)
        system(to_gzip_comm)


[docs]def rapsearch(mode, e, contig_loc, rap_out, KEGG=None):
    """Runs RAPSearch with using KEGG databases for similarity search.

    Args:
        mode: if `mode` is 'run', then program runs rapsearch
        e: if e=True, then function runs exist_check function
        contig_loc: a query file
        rap_out: output file name
        KEGG: default is None, if KEGG= KO,
            then ko.pep.rapsearch.db is chosen as protein database

    HARDCODED:
        if KEGG='masl'

    GLOBALS:
        - path to RAPSearch program: PATH_RAPSEARCH
        - paths to data for RAPSearch:
            PATH_REF_PROT_MASL28,
            PATH_REF_PROT_KO,
            PATH_REF_PROT_ELSE
    """
    rap_log = rap_out + '.log'
    rap_err = rap_out + '.err'
    if KEGG == 'masl':
        ref_prot_loc = PATH_REF_PROT_MASL28
        rap_com = '%s -q %s -d %s -o %s -z 12 -v 20 -b 1 -t n -a t 1> %s 2> %s'%(
            PATH_RAPSEARCH, contig_loc, ref_prot_loc, rap_out, rap_log, rap_err
            )
    elif KEGG == 'KO':
        ref_prot_loc = PATH_REF_PROT_KO
        rap_com = '%s -q %s -d %s -o %s -z 12 -v 20 -b 1 -t n -a t 1> %s 2> %s'%(
            PATH_RAPSEARCH, contig_loc, ref_prot_loc, rap_out, rap_log, rap_err
            )
    else:
        ref_prot_loc = PATH_REF_PROT_ELSE
        rap_com = '%s -q %s -d %s  -o %s -z 10 -e 0.001 -b 100 -v 100 -g T -a T 1> %s 2> %s'%(
            PATH_RAPSEARCH, contig_loc, ref_prot_loc, rap_out, rap_log, rap_err
            )
    print rap_com
    todo = [rap_com]
    if e:
        todo = exist_check('rapsearch', rap_out, todo)
    if (mode == 'run') and (len(todo) != 0):
        system(rap_com)


[docs]def run_megan(out_dir, m8, contigs, tax_gi_file='/home/pszczesny/workingdata/mapping/gi_taxid_prot.dmp'):
    """Makes and launches scripts for MEGAN

    Args:
        out_dir:
            output directory

        m8:
            list of m8 files

        contigs:
            contigs file

        tax_gi_file:
            taxGIFile

    HARDCODED:
        default path to taxGIFile
    """
    print "MEGAN"
    if out_dir=='in_situ': out_dir = getcwd()
    for rma in [(".lite.rma","1","lite"),(".rma","0","just")]:
        script = pjoin(out_dir,rma[-1] + '_megan_script.txt')
        with open(script, 'w') as f:
            f.write("set useParseTextTaxonomy=false;\n")
            f.write("load taxGIFile='" + tax_gi_file + "';\n")
            f.write("set taxUseGIMap=true;\n")
            for file_ in m8:
                megan_file = pjoin(out_dir, 'megan' + rma[0])
                f.write("import blastFile='" + file_ + "' ")
                f.write("fastaFile='" + contigs + "' ")
                f.write("meganFile='" + megan_file +"' ")
                f.write(
                    "maxMatches=100 minScore=50.0 maxExpected=0.01 " +
                    "topPercent=10.0 minSupport=50 minComplexity=0.44 " +
                    "useMinimalCoverageHeuristic=false useSeed=true " +
                    "useCOG=true useKegg=true paired=false " +
                    "useIdentityFilter=false textStoragePolicy=" + rma[1] +
                    " blastFormat=BlastTAB mapping='Taxonomy:GI_MAP=true';\n"
                )
            f.write("quit;")
        system("xvfb-run --auto-servernum --server-num=1 " + PATH_MEGAN + " -g -E -c " + script)


[docs]def MV(mode, e, global_out_dir, k_mers, cat, pair, ins_len, rap=False, megan=False):
    """Runs Velvet. Runs :func:`gzip_MV` function on folder with Velvet results.

    The name of the folder is derived from `pair` argument::

        pair_uni_name(pair) + '_velvh_out'

    The function can also run :func:`rapsearch` and :func:`megan` functions
    when called with appropriate arguments.

    Args:
        mode: if `mode` is 'run' program runs velveth or velvetg or metavelvetg
        e: if true, then program changes todo list with :func:`exist_check`: function
        k_mers: k-length nucleotides reads list
        cat: name of current folder
        pair: tuple of paired_end reads
        ins_len:
            if `9999` given, then `ins_len` will be retrieved
            as an output of :func:`ins_len_read` function
        rap: if true, then program runs :func:`rapsearch` function.
        megan: if true, then program runs :func:`megan` function.

    GLOBALS:
        PATH_VELVETH
        PATH_VELVETG
        PATH_METAVELVETG
    """
    fileext = '.'.join(split(pair[0], '.')[1:])
    if ins_len == 9999:
        ins_len = ins_len_read(pair, cat)
    out_dir_local = pair_uni_name(pair) + '_velvh_out'
    out_dir = pjoin(cat, out_dir_local)
    if len(k_mers) == 1:
        k_min = int(k_mers[0])
        k_max = int(k_min)+1
        k_step = 2
    elif len(k_mers) == 2:
        k_min = int(k_mers[0])
        k_max = int(k_mers[1])
        k_step = 2
    elif len(k_mers) == 3:
        k_min = int(k_mers[0])
        k_max = int(k_mers[1])
        k_step = int(k_mers[2])
    else:
        k_min = 31
        k_max = 32
        k_step = 1
    if k_min > k_max:
        k_min, k_max = k_max, k_min
    elif k_min == k_max:
        k_max += 1
    else:
        pass
    for idx in xrange(k_min, k_max, k_step):
        todo = ['velveth', 'velvetg', 'meta']
        tmp_out_dir = out_dir + '_k-mer_' + str(idx)
        log_loc = tmp_out_dir + '/logfile'
        velveth_run = PATH_VELVETH+' %s %i -%s -shortPaired %s %s' % (
            tmp_out_dir, idx, fileext, pjoin(cat, pair[0]), pjoin(cat, pair[1])
        )
        velvetg_run = PATH_VELVETG+' %s -exp_cov auto -ins_length %i'%(tmp_out_dir, ins_len)
        meta_run = PATH_METAVELVETG+' %s -ins_length %i | tee %s'%(tmp_out_dir, ins_len, log_loc)
        print velveth_run, '\n', velvetg_run, '\n', meta_run
        if e:
            file_list = [
                'meta-velvetg.contigs.fa',
                'intermediates.tgz',
                'Sequences',
                'Roadmaps',
                'Log',
                'contigs.fa',
                'Graph2',
                'LastGraph',
                'stats.txt'
            ]
            file_list = [pjoin(tmp_out_dir, x) for x in file_list]
            todo = exist_check('MV', file_list, todo)
        if mode == 'run':
            if 'velveth' in todo:
                system(velveth_run)
            if 'velvetg' in todo:
                system(velvetg_run)
            if 'meta' in todo:
                system(meta_run)
            gzip_MV(tmp_out_dir)
        if rap:
            rap_in = pjoin(tmp_out_dir, 'meta-velvetg.contigs.fa')
            rap_f = pair_uni_name(pair) + '.rapsearch'
            rap_out = pjoin(tmp_out_dir, rap_f)
            rapsearch(mode, e, rap_in,  rap_out)
    if megan and mode == 'run':
        m8_dict = cat_read(mode, 'm8', False)
        m8 = []
        for dir_, files in m8_dict.items():
            for file_ in files:
                m8.append(pjoin(dir_, file_))
        if (not e) or (len(glob(pjoin(global_out_dir, '*lite.rma'))) >= len(m8)):
            run_megan(global_out_dir, m8, pjoin(tmp_out_dir, 'meta-velvetg.contigs.fa'))


[docs]def MH(mode, e, glob_out_dir, t, cat, pair, rap=False, presets='meta-large', megan=False):
    """Runs Megahit (also :func:`rapsearch` and :func:`run_megan`)

    Args:
        mode:
            if `mode` is other than 'run' program prints
            commands without executing them
        e:
            if true, then program updates todo list
            with :func:`exist_check` function.
        t:
            number of threads
        cat:
            name of current folder
        pair:
            tuple of paired_end reads
        rap:
            if true, program runs :func:`rapsearch` function.
        presets:
            argument for megahit
        megan:
            if true, program runs :func:`megan` function.

    GLOBAL:
        PATH_MEGAHIT
    """
    out_dir = pjoin(getcwd(), pair_uni_name(pair) + '_megahit_out')
    mega_hit_run = (PATH_MEGAHIT + ' -t ' + str(t) + ' -1 ' + pair[0] + ' -2 ' +
        pair[1] + ' -o ' + out_dir + ' --presets ' + presets)
    print mega_hit_run
    if mode == 'run':
        system(mega_hit_run)
        system('rm -r ' + pjoin(out_dir, 'intermediate_contigs'))
    if rap:
        rap_in = pjoin(out_dir, 'final.contigs.fa')
        rap_f = pair_uni_name(pair) + '.rapsearch'
        rap_out = pjoin(out_dir, rap_f)
        rapsearch(mode, e, rap_in,  rap_out)
    if megan and mode == 'run':
        m8_dict = cat_read(mode, 'm8', False)
        m8 = []
        for dir_, files in m8_dict.items():
            for file_ in files:
                m8.append(pjoin(dir_, file_))
        if (not e) or (len(glob(pjoin(glob_out_dir, '*lite.rma')))>=len(m8)):
            run_megan(glob_out_dir, m8, pjoin(out_dir,'final.contigs.fa'))


[docs]def usearch(mode, e, search_type, infile, database, threads):
    """Launches Usearch with -usearch_local command.

    HARDCODED:
        - Usearch options:
            - evalue = 0.01
            - id = 0.9
            - strand = both

    Args:
        mode:
            if `mode` is 'run' Usearch will be launched.
        e:
            if true, checks if a part of workflow is actually done and doesn't
            duplicate this jobs.

            For more information, please refer to ``e`` argument
            in :func:`exist_check` function.

        search_type: &
        infile: a name of input file for usearch.
            Also determines name of [-blast6out in Usearch] output file.

        database:    [-db in Usearch]
        threads:     [-threads in Usearch] Number of threads used in
                     calculations.

    GLOBALS:
        PATH_USEARCH
    """
    outfile = infile + '.usearch_' + search_type
    usearch_command = '%s -usearch_local %s -db %s -evalue 0.01 -id 0.9 -blast6out %s -strand both -threads %i'%(
        PATH_USEARCH, infile, database, outfile, threads
        )
    todo = ['usearch']
    print usearch_command
    if e:
        todo = exist_check('usearch', outfile, todo)
    todo = exist_check('usearch_0', infile, todo)
    if (mode == 'run') and (len(todo) != 0):
        system(usearch_command)


[docs]def adapter_read_bck(adapter_file, filename):
    """Function reads all lines of given `adapter_file` and separates words
    in each line. If first word in a line is equal to filename, then second
    and third words are returned as a tuple.

    Args:
        adapter_file, filename

    Returns:
        tuple (fin_adap_1, fin_adap_2)

    """
    with open(adapter_file, 'r') as plik:
        content = plik.readlines()
        for linia in content:
            if linia:
                fastqname, adap1, adap2 = split(linia)
                if fastqname == filename:
                    fin_adap_1, fin_adap_2 = adap1, adap2
                else:
                    print filename, fastqname
                    pass
    return fin_adap_1, fin_adap_2


[docs]def adapter_read(filename):
    """Replace 'NNNNNN' part of adp_2 by this part of the filename,
     which contains only letters that are symbols of nucleotides.

    Example:

        input::

            Amp18_BFp_B_p_GACGAC_L001_R1_001.fastq

        output::

            (
                'AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT',
                'AGATCGGAAGAGCACACGTCTGAACTCCAGTCACGACGACATCTCGTATGCCGTCTTCTGCTTG'
            )

    HARDCODED:
        Adapter sequences
    """
    adp_1 = 'AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT'
    adp_2 = 'AGATCGGAAGAGCACACGTCTGAACTCCAGTCACNNNNNNATCTCGTATGCCGTCTTCTGCTTG'
    nameparts = split(filename, '_')
    nucleotides = set('ACTG')
    identifier = [item for item in nameparts if not set(item).difference(nucleotides)][0]
    adp_2 = adp_2.replace('NNNNNN', identifier)
    return adp_1, adp_2


[docs]def cutadapt(mode, e, cat, R1_file, R2_file, adapter_file, usearch_16S=False, usearch_ITS=False, threads=False):
    """Performs preparation of reads to be used in usearch in three steps:

    1. cutadapt - searches for the adapters in reads from input files,
       removes them, and creates output files with ``.cutadapt.fastq`` extension.
    2. runs FLASH (software tool to merge paired-end reads) with those files
       and gets ``.amplicons.cutadapt.flash.merged.fastq`` files from output.
    3. these results are input for fastq_to_fasta which
       converts file format from fastq to fasta

    Optionally usearch can be run on files prepared this way.

    Args:
        mode:
            if mode is 'run', function operate on data
        e:
            if true, checks if a part of the workflow is actually done
            and omits these parts, to avoid duplicating this job.
        cat:
            name of folder with R_1 and R_2 files
        R1_file, R2_file:
            input files
        adapter_file:
            If 'use_filenames', the function will use adapters returned by
            `adapter_read(R1_file)``. Else, function will use adapters
            returned by ``adapter_read_bck(adapter_file, R1_file)``
        usearch_16S:
            if true runs
            ``usearch(mode, e, '16S', outname_uni_fasta, usearch_16S, threads)``
            where `outname_uni_fasta` is cutadapt output.
        usearch_ITS:
            if true runs
            ``usearch(mode, e, 'ITS', outname_uni_fasta, usearch_ITS, threads)``
            where `outname_uni_fasta` is cutadapt output.

     See Also:
        Follwoing functions may provide you with better background.
        - :func:`adapter_read_bck`
        - :func:`adapter_read`

    GLOBALS:
        PATH_FQ2FA
    """
    R1_fastq = pjoin(cat, R1_file)
    R2_fastq = pjoin(cat, R2_file)
    if adapter_file == 'use_filenames':  # WARNING HARDCODE
        adapter_1, adapter_2 = adapter_read(R1_file)
    elif adapter_file == 'use_paths':
        adapter_1, adapter_2 = adapter_read(R1_fastq)
    else:
        adapter_1, adapter_2 = adapter_read_bck(adapter_file, R1_file)
    outname_1 = '.'.join(split(R1_fastq, '.')[:-1]) + '.cutadapt.fastq'
    outname_2 = '.'.join(split(R2_fastq, '.')[:-1]) + '.cutadapt.fastq'
    outname_uni_fastq_preflash = pair_uni_name([R1_file]) + '.amplicons.cutadapt.flash.merged.fastq'
    outname_uni_fastq_postflash = outname_uni_fastq_preflash + '.extendedFrags.fastq'
    outname_uni_fasta = outname_uni_fastq_postflash[:-1] + 'a'
    names = [(outname_1, R1_fastq), (outname_2, R2_fastq)]
    outnames = [outname_1, outname_2, outname_uni_fastq_postflash, outname_uni_fasta]
    todo = ['cutadapt0', 'cutadapt1', 'flash', 'fq2fa']
    if e:
        todo = exist_check('cutadapt', outnames, todo)
    for idx in xrange(len(names)):
        cut_com = 'cutadapt -b %s -b %s -o %s %s' % (
            adapter_1, adapter_2, names[idx][0], names[idx][1]
        )
        print cut_com
        if (mode == 'run') and ('cutadapt' + str(idx) in todo):
            system(cut_com)
    flash_com = 'flash %s %s -o %s' % (
        outname_1, outname_2, outname_uni_fastq_preflash
    )
    print flash_com
    if 'flash' in todo and mode == 'run':
        system(flash_com)
    fq2fa_com = '%s < %s > %s -Q33' % (
        PATH_FQ2FA, outname_uni_fastq_postflash, outname_uni_fasta
    )
    print fq2fa_com
    if 'fq2fa' in todo and mode == 'run':
        system(fq2fa_com)
    if usearch_16S:
        usearch(mode, e, '16S', outname_uni_fasta, usearch_16S, threads)
    if usearch_ITS:
        usearch(mode, e, 'ITS', outname_uni_fasta, usearch_ITS, threads)


[docs]def auto_tax_read(db_loc):
    """Read ``{GI:TaxID} & {TaxID:scientific_name}`` dicts from pickle.

    Args:
        db_loc: path to pickle file

    Returns:
        Two dicts:
          ``{GI:TaxID}``              Keys are integers, values are strings
          ``{TaxID:scientific_name}`` Keys and values are integers
    """
    with open(db_loc, 'rb') as fp:
        tax_names = pickle.load(fp)
        tax_id = pickle.load(fp)
    return tax_id, tax_names


[docs]def taxa_read(read_mode, db_loc=None):
    """Returns ``{GI:TaxID} & {TaxID:scientific_name}`` dicts.

    Args:
        read_mode:
            if read_mode is equal to 'manual', then:
                :func:`tax_id_reader` and :func:`tax_name_reader` functions
                will be used and data is read from text files.
                Refer to mentioned functions for more information.
            otherwise:
                :func:`auto_tax_read()` function (with `db_loc` as an argument)
                will be used and data will be read from a pickle file.
                Refer to mentioned function for more information.

        db_loc:
            path to pickle file, used when `read_mode` is other than 'manual'

    Returns:
        Two dicts:
          - ``{GI:TaxID}``              Keys are integers, values are strings.
          - ``{TaxID:scientific_name}`` Keys and values are integers.
    """
    if read_mode == 'manual':
        tax_id_start = datetime.now()
        tax_id_dict = tax_id_reader()
        tax_name_start = datetime.now()
        print 'tax_ids read in ', str(tax_name_start - tax_id_start)
        tax_name_dict = tax_name_reader()
        tax_name_end = datetime.now()
        print 'tax_names read in ', str(tax_name_end - tax_name_start)
    else:
        tax_id_dict, tax_name_dict = auto_tax_read(db_loc)
    return tax_id_dict, tax_name_dict


[docs]def reconstruct(mode, thr, e, pair, cat, prefix, rec_db_loc):
    """Runs bwa. Find the SA coordinates of the input reads.
    Generate alignments in the SAM format given single-end reads.
    Repetitive hits will be randomly chosen.

    Args:
        mode: if 'run', then commands "bwa aln" and "bwa samse" were run
        thr: Number of threads (multi-threading mode)
        e: (!!!! Wasn't used !!!!)
        pair: tuple of paired_end read
        cat: name of folder with the sample files
        prefix: prefix for output files names
        rec_db_loc: input database with fasta files
    """
    tmp_loc = '/tmp/' + '_'.join(str(datetime.now()).split())
    mkdir = "mkdir %s"%(tmp_loc)
    fq1 = pjoin(cat, pair[0])
    fq2 = pjoin(cat, pair[1])
    reads_fastq = pjoin(tmp_loc, 'reads.fastq')
    merge = "cat %s %s > %s"%(fq1, fq2, reads_fastq)
    reads_sai = pjoin(tmp_loc, 'reads.sai')
    bwa_aln_com = "bwa aln -t %i %s %s > %s"%(thr, rec_db_loc, reads_fastq, reads_sai)
    alnsam = pjoin(tmp_loc, 'aln.sam')
    bwa_samse_com = "bwa samse %s %s %s > %s"%(rec_db_loc, reads_sai, reads_fastq, alnsam)
    alnbam = pjoin(tmp_loc, 'aln.bam')
    alnsrt = pjoin(tmp_loc, 'aln.sorted')
    alnsrtbam = pjoin(tmp_loc, 'aln.sorted.bam')
    if prefix[-1] != '_':
        prefix += '_'
    alnfastq = pjoin(cat, prefix + 'aln.fastq')
    pile_com = "samtools mpileup -uf %s %s |bcftools view -cg - |vcfutils.pl vcf2fq > %s"%(rec_db_loc, alnsrtbam, alnfastq)
    rm_tmp = "rm -rf %s"%(tmp_loc)

    print "%s" % (mkdir)
    system("%s" % (mkdir))
    print "%s" % (merge)
    print "%s" % (bwa_aln_com)
    print "%s" % (bwa_samse_com)
    if mode == 'run':
        system("%s" % (merge))
        system("%s" % (bwa_aln_com))
        system("%s" % (bwa_samse_com))
    bam_make(mode, alnsam, alnbam)
    bam_sorting(mode, alnbam, alnsrt)
    bam_indexing(mode, alnsrtbam)
    print "%s" % (pile_com)
    if mode == 'run':
        system("%s" % (pile_com))
    print "%s" % (rm_tmp)
    system("%s" % (rm_tmp))


[docs]def humann(mode, e, m8_dict, typ='m8'):
    """Copies humann to the current directory, moves input (\*.m8) files to
    the input directory, copies hmp_metadata.dat file to the input directory
    and runs humann.

    GLOBALS:
        - path to humann program: PATH_HUMANN
        - path to data for humann: PATH_HUMANN_DATA

    Args:
        mode: if 'run', then humann will be copied to the current directory
        e: if true, then function checks existence of humann-0.99 results folder
        m8_dict: the similarty search results folders
        typ:
            default typ="m8", in that case new catalog humann-0.99 will be
            created in rapsearch result folder; in other case humann analysis
            results will be added in rapsearch result folder.
    """
    for path in m8_dict:
        flag = True
        if typ == 'm8':
            hum_loc = pjoin(path, 'humann-0.99')
        else:
            hum_loc = path
            flag = None
        if e:
            if pexists(hum_loc):
                print 'Found %s'%(hum_loc)
                flag = None
        if flag:
            get_humann = 'cp -r %s %s' % (PATH_HUMANN, path)
            print get_humann
            if mode == 'run':
                system(get_humann)
        input_loc = pjoin(hum_loc, 'input')
        if typ == 'm8':
            chdir(hum_loc)
        else:
            curr_loc = hum_loc[:-6]
            chdir(curr_loc)
        print 'I am in here: %s' % (getcwd())
        if flag:
            mv_m8 = 'mv %s %s'%(pjoin(path, m8_dict[path][0]), pjoin(input_loc, m8_dict[path][0][:-3].replace('_', '-')))
            print mv_m8
            clean_comm = 'rm -rf %s/*'%(input_loc)
            print 'Executing: %s'%(clean_comm)
        if typ == 'm8':
            hmp_get_comm = 'cp %s %s/'%(PATH_HUMANN_DATA, input_loc)
        else:
            hmp_get_comm = 'cp %s %s/'%(PATH_HUMANN_DATA, hum_loc)
        print 'Getting hmp: %s'%(hmp_get_comm)
        humann_comm = 'scons 1>log 2>errlog'
        print 'Executing: %s'%(humann_comm)
        if mode == 'run':
            if flag:
                system(clean_comm)
                system(mv_m8)
            system(hmp_get_comm)
            system(humann_comm)


[docs]def sample(opts):
    """Runs programs such as Velvet, bwa, Bowtie 2, humann;
    cuts adapters or run RAPSearch if these options were chosen.

    Args:
        opts: A namespace, where:

            opts.to_calculate:
                Input sequences type (Plant or fungi).
                TODO (this description seems to be misleading)

            opts.mode:
                A mode in which the program will be run.
                One of: ['test', 'run'].

            opts.db_NCBI_taxonomy:
                Path to pickle file.

            opts.reconstruct:
                Determines if :func:`reconstruct` function should be run.

            opts.assembler:
                Assembly program
                One of: ['MH','MV'].

            opts.MV:
                k-length nucleotids reads list or None.

            opts.db_reconstruct:
                Input database with fasta files.

            opts.threads:
                Number of threads (multi-threading mode).

            opts.e:
                If true, check if file already exists.

            opts.ins_len:
                200 if read length is less than 200 and 500 in otherwsie.

            opts.db_refseq_fungi:
                A list of paths to refseq databases to use in fungi analysis.
                Up to two paths are allowed.

                Warning:
                If there are multiple databases with filenames like
                <your_path><second_part_of_name>, all will be loaded.

            opts.db_refseq_plant:
                A list of paths to refseq databases to use in plants analysis.
                Up to two paths are allowed.
                Also check warning in opts.db_refseq_fungi description.

            opts.our_dir:
                Directory where sums of mapped and unmapped
                reads will be written

            opts.postfix:
                TODO

            opts.cutadapt:
                List of adapter types: '16S', 'ITS' or 'both'.

            opts.db_16S:
                database of 16S adapters, which will be -db parameter for
                Usearch program; if it is not empty, then cutadapt function`s
                parameter usearch_16S will be True.__truediv__

            opts.db_ITS:
                database of ITS adapters, which will be -db parameter for
                Usearch program; if it isn't empty,then cutadapt function`s
                parameter usearch_ITS will be True.

    GLOBALS:
        PATH_FQ2FA

    HARDCODED:
        actions dependent on existence of 'seq' substring in some paths.
    """
    if len(opts.db_refseq_plant) == 1:
        opts.db_refseq_plant += [False]
    if len(opts.db_refseq_fungi) == 1:
        opts.db_refseq_fungi += [False]

    if opts.to_calculate is None:
        opts.to_calculate = []
    refseq_test = len({'f', 'b', 'p'} & set(opts.to_calculate))
    if (refseq_test > 0) and opts.mode == 'run':
        if pexists(opts.db_NCBI_taxonomy):
            tax_id_dict, tax_name_dict = taxa_read('auto', opts.db_NCBI_taxonomy)
        else:
            tax_id_dict, tax_name_dict = taxa_read('manual')
    else:
        tax_id_dict = tax_name_dict = {}
    fastq_dict = cat_read(opts.mode, 'fastq')
    pairs = []  # Fast, but ugly fix. I'm sorry.
    for cat in fastq_dict:
        for pair in fastq_dict[cat]:
            pairs.append(pair)
    for i in range(len(pairs)):
        pair = pairs[i]
        if opts.reconstruct:
            reconstruct(opts.mode, opts.threads, opts.e, pair, cat, opts.reconstruct, opts.db_reconstruct)
        if opts.assembler == "MH":
            MH(opts.mode, opts.e, opts.out_dir, opts.threads, cat, pair, ('rap_prot' in opts.to_calculate),
                'meta-large', ('MEGAN' in opts.to_calculate) and (i==len(pairs)-1))
        elif opts.assembler == "MV":
            MV(opts.mode, opts.e, opts.out_dir, opts.MV, cat, pair, opts.ins_len, ('rap_prot' in opts.to_calculate),
                ('MEGAN' in opts.to_calculate) and (i==len(pairs)-1))
        if ('f' in opts.to_calculate) or ('b' in opts.to_calculate):
            postfix = opts.postfix + 'fungi'
            refseq_mapping(
                opts.mode, opts.e, cat, pair, postfix, opts.db_refseq_plant[0], tax_name_dict,
                tax_id_dict, opts.threads, opts.out_dir, opts.db_refseq_plant[1]
                )
        if ('p' in opts.to_calculate) or ('b' in opts.to_calculate):
            postfix = opts.postfix + 'plant'
            refseq_mapping(
                opts.mode, opts.e, cat, pair, postfix, opts.db_refseq_fungi[0], tax_name_dict,
                tax_id_dict, opts.threads, opts.out_dir, opts.db_refseq_fungi[1]
                )
        if opts.cutadapt != '':
            if ('both' in opts.cutadapt) or (('16S' in opts.cutadapt) and ('ITS' in opts.cutadapt)):
                cutadapt(opts.mode, opts.e, cat, pair[0], pair[1], opts.cutadapt[0],
                    opts.db_16S, opts.db_ITS, opts.threads
                    )
            elif '16S' in opts.cutadapt:
                cutadapt(opts.mode, opts.e, cat, pair[0], pair[1], opts.cutadapt[0],
                    opts.db_16S, False, opts.threads
                    )
            elif 'ITS' in opts.cutadapt:
                cutadapt(opts.mode, opts.e, cat, pair[0], pair[1], opts.cutadapt[0],
                    False, opts.db_ITS, opts.threads
                    )
            else:
                cutadapt(opts.mode, opts.e, cat, pair[0], pair[1], opts.cutadapt[0])

    if opts.assembler is not None:
        contig_dict = cat_read(opts.mode, 'fa', False)
        for cat in contig_dict:
            for contig in contig_dict[cat]:
                if ('f' in opts.to_calculate) or ('b' in opts.to_calculate):
                    postfix = opts.postfix + 'fungi'
                    refseq_mapping(
                        opts.mode, opts.e, cat, contig, postfix, opts.db_refseq_plant[0], tax_name_dict,
                        tax_id_dict, opts.threads, opts.out_dir, opts.db_refseq_plant[1]
                        )
                if ('p' in opts.to_calculate) or ('b' in opts.to_calculate):
                    postfix = opts.postfix + 'plant'
                    refseq_mapping(
                        opts.mode, opts.e, cat, contig, postfix, opts.db_refseq_fungi[0], tax_name_dict,
                        tax_id_dict, opts.threads, opts.out_dir, opts.db_refseq_fungi[1]
                        )
    else:
        if 'rap_prot' in opts.to_calculate:
            contig_dict = cat_read(opts.mode, 'contigs.fa', False)
            for cat in contig_dict:
                for contig in contig_dict[cat]:
                    rap_in = pjoin(cat, contig)
                    rap_out = pjoin(cat, split(cat, '/')[-1] + '.rapsearch')
                    rapsearch(opts.mode, opts.e, rap_in,  rap_out)

    if ('rap_KEGG' in opts.to_calculate) or ('rap_KO' in opts.to_calculate):
        fastq_dict = cat_read(opts.mode, 'fastq')
        fasta_dict = {}
        for path in fastq_dict:
            fasta_dict[path] = []
            for fastq_pair in fastq_dict[path]:
                for fastq in fastq_pair:
                    fasta = fastq[:-1] + 'a'
                    flag = True
                    if opts.e:
                        if pexists(pjoin(path, fasta)):
                            print 'Found %s'%(pjoin(fasta))
                            flag = None
                    if flag:
                        fq2fa_com = '%s < %s > %s -Q33'%(PATH_FQ2FA, pjoin(path, fastq), pjoin(path, fasta))
                        print fq2fa_com
                        if opts.mode == 'run':
                            system(fq2fa_com)
                    fasta_dict[path].append(fasta)
        rap_paths = []
        for path in fasta_dict:
            # WARNING!!! HARDCODE
            if 'seq' in path:
                new_path = '_'.join(split(path, '_')[:-1])
                if pexists(new_path):
                    pass
                else:
                    system('mkdir %s'%(new_path))
            else:
                new_path = path
            rap_in =  pjoin(new_path, 'tmp.fasta')
            rap_out = pjoin(new_path, split(new_path, '/')[-1]+'.txt')
            rap_paths.append((rap_in, rap_out))
            for fasta in fasta_dict[path]:
                flag = True
                if opts.e:
                    if pexists(rap_in):
                        print 'Found %s'%(rap_in)
                        flag = False
                if flag:
                    cat_com = 'cat %s >> %s'%(pjoin(path, fasta), rap_in)
                    print cat_com
                    if opts.mode == 'run':
                        system(cat_com)
        if 'rap_KEGG' in opts.to_calculate:
            kegg_db = 'masl'
        else:
            kegg_db = 'KO'
        for pair in rap_paths:
            rapsearch(opts.mode, opts.e, pair[0], pair[1], kegg_db)

    if 'humann' in opts.to_calculate:
        m8_dict = cat_read(opts.mode, 'txt.m8', False)
        if len(m8_dict) > 0:
            humann(opts.mode, opts.e, m8_dict)
        else:
            txt_dict = cat_read(opts.mode, 'txt', False)
            tmp_dict = txt_dict.copy()
            for key in txt_dict:
                if 'input' not in key:
                    del tmp_dict[key]
            humann(opts.mode, opts.e, tmp_dict, 'nom8')

    if '16S' in opts.to_calculate or 'ITS' in opts.to_calculate:
        fasta_dict = cat_read(opts.mode, 'fa', False)
        for cat in fasta_dict:
            for fasta in fasta_dict[cat]:
                if '16S' in opts.to_calculate:
                    usearch(opts.mode, opts.e, '16S', pjoin(cat, fasta), opts.db_16S, opts.threads)
                if 'ITS' in opts.to_calculate:
                    usearch(opts.mode, opts.e, 'ITS', pjoin(cat, fasta), opts.db_ITS, opts.threads)


[docs]def SSU_read(loc, headers_type='ITS'):
    """Extracts from specially formatted FASTA file taxonomic data and returns
    them as hierarchically organised dict.

    Headers of FASTA files should follow schemas presented by following examples:

        - ITS format (used in UNITE database)::

            >DQ233785|uncultured ectomycorrhizal fungus|Fungi|Thelephora terrestris|Fungi; Basidiomycota; Agaricomycotina; Agaricomycetes; Incertae sedis; Thelephorales; Thelephoraceae; Thelephora; Thelephora terrestris

        - 16S format (alternative)::

            >AF093247.1.2007 Eukaryota;Amoebozoa;Mycetozoa;Myxogastria;;Hyperamoeba_sp._ATCC50750

    Args:
        loc: location - path to the FASTA file

        headers_type: tells which format of header is present in given file.
            If specified, indicates use of alternative format.

    Returns:
        A dict with taxonomic data, where
            - keys are sequence identifiers;
            - values are lists of taxonomic terms, in order:
              from the most generic to the most specific one.

        Example::

            {
                'DQ482017':
                [
                    'Fungi',
                    'Basidiomycota',
                    'Agaricomycotina',
                    'Agaricomycetes',
                    'Incertaesedis',
                    'Thelephorales',
                    'Thelephoraceae',
                    'Tomentella',
                    'Tomentellasublilacina'
                ],
                'EF031133':
                [
                    'Fungi',
                    'Basidiomycota',
                    'Agaricomycotina',
                    'Agaricomycetes',
                    'Incertaesedis',
                    'Thelephorales',
                    'Thelephoraceae',
                    'Thelephora',
                    'Thelephoraterrestris'
                ]
            }

    """
    tax_dict = {}
    with open(loc, 'r') as fasta_file:
        for linia in fasta_file.readlines():
            if linia[0] == '>':
                linia = linia.rstrip()
                if headers_type == '16S':
                    linia = linia.split()
                    tax_idx = linia[0][1:]
                    tax_dict[tax_idx] = linia[1].split(';')
                elif headers_type == 'ITS':
                    line = linia.split('|')
                    tax_idx = line[0][1:]
                    if linia.count('Fungi') == 2:
                        tax_line = line[-1].replace(" ", "")
                    elif linia.count('Fungi') == 1:
                        tax_line = line[2].replace(" ", "")
                        if len(tax_line.split(';')) == 1:
                            tax_line = line[-1].replace(" ", "")
                    else:
                        tax_line = ""
                    if tax_line == '-':
                        tax_line = 'Fungi'+';' + linia.split('|')[1].replace(" ", "_")
                    tax_dict[tax_idx] = tax_line.split(';')
                else:
                    raise KeyError('Unknown headers_type %s' % headers_type)
    return tax_dict


[docs]def tuple_to_dict(tuple_dict):
    """ Input dict has tuples as keys and int type values.
    In the dict returned by function:

       - keys are elements from input dict's tuples,
       - values are dicts.

    Args:
        tuple_dict:
            a dict of tuples

            Example::

                {('first','second','third'): 10, ('first','second'): 30, ...}

    Example will show why this function can be useful:

    input::

        {
            ('Animalia','Mammalia','Canis lupus familiaris'): 10,
            ('Bacteria','Enterobacteriaceae','Escherichia coli'): 20,
            ('Animalia','Mammalia','Felis catus'): 30
        }

    output::

        {
            'Animalia':
            {
                'Mammalia':
                {
                    'Canis lupus familiaris': {'subsum': 10},
                    'Felis catus': {'subsum': 30}
                }
            }
            'Bacteria':
            {
                'Enterobacteriaceae':
                {
                    'Escherichia coli': {'subsum': 20}
                }
            }
        }
    """
    fin_dict = {}
    for key in tuple_dict.keys():
        curr_dict = fin_dict
        val = tuple_dict[key]
        for i in xrange(len(key)):
            if key[i] in curr_dict:
                if i == len(key)-1:
                    if 'subsum' in curr_dict[key[i]]:
                        curr_dict[key[i]]['subsum'] += val
                    else:
                        curr_dict[key[i]]['subsum'] = val
                else:
                    curr_dict = curr_dict[key[i]]
            else:
                if i == len(key)-1:
                    curr_dict[key[i]] = {'subsum': val}
                else:
                    curr_dict[key[i]] = {}
                    curr_dict = curr_dict[key[i]]
    return fin_dict


[docs]def tuple_to_xml_dict(tuple_dict):
    """Input dict has tuples as keys and int type values.
    In dict returned by function all elements from input dict's
    tuples are single keys but their current values
    are the sum of the values of all tuples, in which they were.

    Args:
        tuple_dict: tuple to be modified

            Example::

                {
                    ('first','second','third'): 10,
                    ('first','third','fifth': 30)
                    ...
                }

    Example:
        input::

            {
                ('Animalia','Mammalia','Canis lupus familiaris'): 10,
                ('Bacteria','Enterobacteriaceae','Escherichia coli'): 20,
                ('Animalia','Mammalia','Felis catus'): 30
            }

        output::

            {
                'Animalia': 40,
                'Felis catus': 30,
                'Enterobacteriaceae': 20,
                'Mammalia': 40,
                'Escherichia coli': 20,
                'Bacteria': 20,
                'Canis lupus familiaris': 10
            }
    """
    fin_dict = {}
    for taxonomy in tuple_dict:
        for level in taxonomy:
            if level in fin_dict.keys():
                fin_dict[level] += tuple_dict[taxonomy]
            else:
                fin_dict[level] = tuple_dict[taxonomy]
    return fin_dict


[docs]def dict_purify(bac_dict):
    """Removes from given dict values below the hardcoded threshold.

    Args:
        bac_dict: a dict.

    Returns:
        A new dict without entries, within values were lower than threshold.

    HARDCODED:
        threshold = 10
    """
    threshold = 10
    all_keys = bac_dict.keys()
    to_remove = []
    for key in all_keys:
        if bac_dict[key] < threshold:
            to_remove.append(key)
    for key in to_remove:
        del bac_dict[key]
    return bac_dict


[docs]def file_analysis(typ, name, SSU=None):
    """Using given SSU as database, performs 'statistical' analysis
    of taxonomy from file `name`. It counts occurrences of different
    species in given file and returns result in form of two dicts.

    The threshold hardcoded in dict_purify is used,
    to remove neglectable entities.

    Args:
        typ: an "output type" - typically: 'ITS', '16S' or 'txt'.

        name: name of file to analise

        SSU: A dict with taxonomic data, where:
                - keys are sequence identifiers;
                - values are lists of taxonomic terms, in order:
                  from the most generic to the most specific one.

            Explicit examples included in description of :func:`SSU_read` function.

    Returns:
        A 2-tuple:

            If file of name <name> was not found:
                ('NA', 'NA')
            If data of type '16S' was successfully analysed:
                (full_bac_dict, tax_bac_dict)
            If data of type 'ITS' was successfully analysed:
                (full_fun_dict, tax_fun_dict)

        full_bac_dict / full_fun_dict: a dict of dicts.
            It represents taxonomic tree. Generally in this dict:
                - keys are names of taxa,
                - values are nested dicts of the same type.

            Values in the deepest levels ("leafs") are counts of occurrences.

            Example::

                {
                    'Bacillaceae':
                    {
                        'Anoxybacillus': {'subsum': 15}
                    },
                    'Listeriaceae':
                    {
                        'Listeria':
                        {
                            'Lgrayi': {'subsum': 22},
                            'Linnocua': {'subsum': 11}
                        }
                    }
                }

        tax_bac_dict / tax_fun_dict: a dict, with taxonomic statistics:
            - keys are names of taxa
            - values are counts of occurrences

            Example::

                {
                    'Bacillaceae': 15,
                    'Anoxybacillus': 15,
                    'Listeriaceae': 33,
                    'Listeria': 33,
                    'Lgrayi': 22,
                    'Linnocua': 11
                }

    HARDCODED:
        In '16S' analysis, if spec is 'Phaseolus_acutifolius_(tepary_bean)',
        then it is not counted neither as bacteria nor archaea.

    TODO:
        There was a suggestion, that there might be something wrong with handling
        of 'name' argument, when passed in only file_analysis call present in
        this file. The issue is linked to "full path or basename?" question.
        It should be investigated, in fully functional testing environment.
    """
    if not pexists(name):
        return 'NA', 'NA'
    else:
        with open(name, 'r') as content:
            linie = content.readlines()
            # I have commented following block as a result of
            # discussion on slack with siwek (Michal)
            # if typ in ['fmc', 'pmc']:
            #     mapped, unmapped = linie[0].strip().split('-')
            #     try:
            #         mapped = int(mapped)
            #     except:
            #         mapped = 'NA'
            #     try:
            #         unmapped = int(unmapped)
            #     except:
            #         unampped = 'NA'
            #     if mapped == 'NA' or unmapped == 'NA':
            #         total = 'NA'
            #     else:
            #         total = mapped + unmapped
            #     return mapped, total
            # if typ == 'log':
            #     data = linie[-1].split(' ')
            #     try:
            #         n50 = int(data[8][:-1])
            #         total = int(data[12][:-1])
            #         using = split(data[14], '/')[0]
            #     except:
            #         n50 = total = using = 'NA'
            #     return n50, total, using
            if typ == '16S':
                bac_arch = [0, 0]   # counts: 0 - of bacteria 1 - of archaea
                bac_dict = {}   # counts species/strains of bacteria
                for linia in linie:
                    tax_id = linia.split('\t')[1]
                    cult_control = 0
                    spec_idx = -1
                    while cult_control == 0:
                        if SSU[tax_id][0] == 'Eukaryota':
                            cult_control = 1
                            spec = False
                        elif 'uncultured' in SSU[tax_id][spec_idx] or 'unidentified' in SSU[tax_id][spec_idx]:
                            spec_idx -= 1
                        elif 'Candidate' in SSU[tax_id][spec_idx]:
                            spec = tuple(SSU[tax_id][:spec_idx+1])
                            cult_control = 1
                        else:
                            spec = tuple(SSU[tax_id][:spec_idx+1])
                            cult_control = 1
                    if SSU[tax_id][0] == 'Bacteria':
                        bac_arch[0] += 1
                    elif SSU[tax_id][0] == 'Archaea':
                        bac_arch[1] += 1
                    else:
                        pass
                    if spec:
                        if spec == 'Phaseolus_acutifolius_(tepary_bean)':
                            pass
                        else:
                            if spec not in bac_dict:
                                bac_dict[spec] = 1
                            else:
                                bac_dict[spec] += 1
                    else:
                        pass
                cut_dict = dict_purify(bac_dict)
                full_bac_dict = tuple_to_dict(cut_dict)
                tax_bac_dict = tuple_to_xml_dict(cut_dict)
                return full_bac_dict, tax_bac_dict
            if typ == 'ITS':
                fun_dict = {}
                for linia in linie:
                    tax_id = linia.split('\t')[1].split('|')[0]
                    spec_name = tuple(SSU[tax_id])
                    if spec_name not in fun_dict:
                        fun_dict[spec_name] = 1
                    else:
                        fun_dict[spec_name] += 1
                cut_dict = dict_purify(fun_dict)
                full_fun_dict = tuple_to_dict(cut_dict)
                tax_fun_dict = tuple_to_xml_dict(cut_dict)
                return full_fun_dict, tax_fun_dict


[docs]def input_locations(mode, out_types):
    """Generates lists of locations where input files are located.
    Lists are grouped by directories and later, by "output types".

    As input files considered are only files,
    which meet all the following conditions:

        - they are located in current working directory
          or in subdirectories of current working directory,
        - have suffix of filename equal to `out_type`,
        - if `out_type` is 'ITS' or '16S', filenames also
          have to contain 'usearch\_' before the `out_type` in name.

    Args:
        mode: a parameter passed to :func:`cat_read` function.
            If mode is 'run', the gunzip command will be executed,
            to extract compressed files.

        out_types: a list with "output types",
            typically consisting  of: 'ITS', '16S' or 'txt'.

    Returns:
        A dict where keys are "output types", values are dicts describing
        file locations returned by :func:`cat_read` function.

        Example::

            {
                'ITS':
                {
                    directory_path1:
                        [file1_in_this_directory_path,
                        file2_in_this_directory_path,
                        file3_in_this_directory_path],
                    directory_path2:
                        [file4_in_this_directory_path,
                        file5_in_this_directory_path,
                        file6_in_this_directory_path]
                }
            }
    """
    in_dict = {}
    for out_type in out_types:
        if out_type in ['ITS', '16S']:
            in_dict[out_type] = cat_read(mode, 'usearch_' + out_type, False)
        else:
            in_dict[out_type] = cat_read(mode, out_type, False)
    return in_dict


[docs]def dict_prepare(typ, indict, SSU):
    """Runs files analysis and then, creates two dicts with summarized
    taxonomic data. The resulting dicts keep information about files
    from which given taxa comes.

    Args:
        typ: an "output type" - typically: 'ITS', '16S' or 'txt'.

        indict: A dict with file structure, where:
                - keys are directories,
                - values are lists of files.

            Example in description od output of :func:`cat_read` function.

        SSU: A dict with taxonomic data, where:
                - keys are sequence identifiers;
                - values are lists of taxonomic terms, in order:
                  from the most generic to the most specific one.

            Example in description of output of :func:`SSU_read` function.

    Returns:
        A tuple: (all_dicts, all_tax_dict)

        all_dicts: A dict, where
            - keys are file identifiers,
            - values are taxonomic trees with numbers of occurrences
              of particular species placed inside leafs

            Example::

                {
                    'filename_1.ext':
                    {
                        'Bacillaceae': {'Anoxybacillus': {'subsum': 15}}
                    },
                    'filename_2.ext':
                    {
                        'Listeriaceae':
                        {
                            'Listeria':
                            {
                                'Lgrayi': {'subsum': 22},
                                'Linnocua': {'subsum': 11}
                            }
                        }
                    }
                }

        all_tax_dict: A dict, where

            - keys are file identifiers,
            - values are dicts, where
                - keys are names of taxa,
                - values are numbers of occurrences of particular taxon.

            Example::

                {
                    'filename_1.ext':
                    {
                        'Bacillaceae': 5, 'Anoxybacillus': 5
                    },
                    'filename_2.ext':
                    {
                        'Listeriaceae': 19, 'Listeria': 19, 'Lgrayi': 16, 'Linnocua': 3
                    }
                }
    """
    all_dicts = {}
    all_tax_dict = {}
    for path in indict.keys():
        for plik in indict[path]:
            plik_id = '_'.join([typ, split(plik, '.')[0]])
            analysed_dict, tax_dict = file_analysis(typ, pjoin(path, plik), SSU)
            all_dicts[plik_id] = analysed_dict
            all_tax_dict[plik_id] = tax_dict
    return all_dicts, all_tax_dict


[docs]def dict_sum(dicto, val):
    """Sums the values in the given dict,
    using recurrence to handle nested dicts.

    Args:
        dicto: a dict, where values are of any type with defined + operation
        val: an int - initial value

    Returns:
        val:
            an int - sum of all values in given dict plus
            value given as second argument
    """
    for key in dicto.keys():
        if isinstance(dicto[key], dict):
            val = dict_sum(dicto[key], val)
        else:
            val += dicto[key]
    return val


[docs]def update_dict(tax_tree, curr_tax):
    """Updates the dict with keys form another one.
    If during walking the dict, the 'subsum' string is found,
    recurrence stops and nodes beneath are ignored.
    The dicts have to contain only other dicts or 'subsum'!

    Args:
        tax_tree: a dict to update
        curr_tax: a dict to be merged into the `tax_tree`

    Returns:
        tax_tree: updated dict

    Example:
        input (tax_tree, curr_tax)::

            (
                {
                    'Listeriaceae':
                    {
                        'Listeria': { 'Lgrayi': {} }
                    }
                },
                {
                    'Listeriaceae':
                    {
                        'Listeria': { 'Linnocua': {'subsum': 11} }
                    }
                }
            )

        output::

            {
                'Listeriaceae':
                {
                    'Listeria':
                    {
                        'Lgrayi': {},
                        'Linnocua': {}
                    }
                }
            }
    """
    for key in curr_tax:
        if key != 'subsum':
            if key not in tax_tree:
                tax_tree[key] = {}
                if curr_tax[key] != 'subsum':
                    tax_tree[key] = update_dict(tax_tree[key], curr_tax[key])
            else:
                tax_tree[key] = update_dict(tax_tree[key], curr_tax[key])
    return tax_tree


[docs]def tree_of_life(full_dict):
    """Rewrites the data from a dict containing information arranged by
    type and then by file into two dicts:

        - first represents full taxonomic tree
        - second presents how many species are inside particular files

    Args:
        full_dict:
            a dict of dicts, with structure like::

                {
                    'output_type_1':
                    {
                        'filename_1.ext':
                        {
                            'Bacillaceae': {'Anoxybacillus': {'subsum': 15}}
                        }
                    }
                }

            Longer example available in description of :func:`xml_format`.

    Returns:
        A tuple (full_tree, file_total_count)

        full_tree:
            A dict without information about type and file.

            Example::

                {
                    'Bacillaceae':
                    {
                        'Anoxybacillus': {}
                    },
                    'Listeriaceae':
                    {
                        'Listeria':
                        {
                            'Lgrayi': {}, 'Linnocua': {}
                        }
                    }
                }

        file_total_count:
            A dict within information about files are kept and values of every
            leaf are summed into values representing total count of species
            inside particular file.

            Example::

                {'filename_1.ext': 15, 'filename_2.ext': 33}
    """
    file_total_count = {}
    full_tree = {}
    for typ in full_dict:
        for plik in full_dict[typ]:
            file_total_count[plik] = dict_sum(full_dict[typ][plik], 0)
            curr_tax = full_dict[typ][plik]
            full_tree = update_dict(full_tree, curr_tax)
    return full_tree, file_total_count


[docs]def xml_name_parse(full_dict):
    """Creates a list of simplified filenames (full filenames comes
    from `full_dict`). The list is guaranteed to not have any duplicates.

    If filename contains sequence of nucleotides from set {A, C, T, G},
    separated from other parts of the name by underscore (_), then
    simplified filename will be substring of filename from beginning
    to this sequence (inclusive). This set of nucleotides in most cases
    will represent "index" from naming schema for fastq files, so
    since this step, many files will belong to one alias.

    Otherwise, the first chunk of filename will be used. In this case,
    filename will be splitted with dot '.' as delimiter. It also
    indicates grouping files with similar filenames since this step.

    Keep in mind, that this step groups files referring to the same
    dataset under single name, and technically is vulnerable for errors:
    If filenames given to this functions do not follow appropriate
    naming schema or when these filenames are not named in prefix-code
    convention, then some false-positives might be generated in
    some of functions which use results of xml_name_parse().

    Example:
        Following filenames::

            '16S_ArchV3V4_M_BF_02_TAGCTT_L001_001.amplicons.cutadapt.flash.merged.fastq.extendedFrags.fasta.usearch_16S'
            '16S_ArchV3V4_M_BF_02_TAGCTT_L001_001.cutadapt.amplicons.cutadapt.flash.merged.fastq.extendedFrags.fasta.usearch_16S'
            'Amp45_BFp_B_CAAAAG_L001_R12.fasta.usearch_ITS'
            'meta-velvetg.contigs.fa.usearch_ITS'

        will become::

            '16S_ArchV3V4_M_BF_02_TAGCTT' # note: this groups first two files
            'Amp45_BFp_B_CAAAAG'
            'meta-velvetg'

    Args:
        full_dict:

            a dict of dicts, with structure like::

                {
                    'output_type_1':
                    {
                        'filename_1.ext':
                        {
                            'Bacillaceae': {'Anoxybacillus': {'subsum': 15}}
                        }
                    }
                }

            Longer example avaliable in description of :func:`xml_format`.

    Returns:
        A list with simplified filenames.
        The list is guaranteed to not have any duplicates.

        Example::

            ['filename_1', 'filename_2']

    """
    # Make set for names.
    # It allows to get rid of duplicates, but order of elements is lost.
    name_set = set()
    for typ in full_dict:
        for filename in full_dict[typ]:
            # Remove extension from filename
            display_name = split(filename, '.')[0]
            try:
                nameparts = split(filename, '_')
                nucleotides = set('ACTG')
                # let identifier be the first part of filename, that is
                # comprised only of nucleotides ACTG.
                identifier = [
                    item
                    for item in nameparts
                    if not set(item).difference(nucleotides)
                ][0]
                display_name = '_'.join(
                    nameparts[0:nameparts.index(identifier) + 1]
                )
            except:
                pass
            name_set.add(display_name)
    name_list = list(name_set)
    return name_list


[docs]def xml_vals(xml_names, tax_dict):
    """Reformats information extracted from tax_dict into another dict,
    to allow use of this data in xml-generation process.

    Args:
        xml_names:
            readable identifiers of groups of files referring to
            common dataset, derived from filenames.

            Example::

                ['filename_1', 'filename_2']

        tax_dict:
            A dict, where:
                - keys are file identifiers,
                - values are dicts, where:
                    - keys are names of taxa,
                    - values are numbers of occurrences of particular taxon.

            Example::

                {
                    'filename_1.ext':
                    {
                        'Bacillaceae': 5, 'Anoxybacillus': 5
                    },
                    'filename_2.ext':
                    {
                        'Bacillaceae': 1, 'Anoxybacillus': 1
                        'Listeriaceae': 19, 'Listeria': 19, 'Lgrayi': 16, 'Linnocua': 3
                    }
                }

    Returns: dict:
        A dict, where:
            - keys are names of taxa,
            - values are lists with counters of occurrences of particular
              taxon in subsequent groups of files. Order on this list
              is defined by order of names in xml_names list.

        Example::

            {
                'Anoxybacillus': [5, 1],
                'Bacillaceae': [5, 1],
                'Lgrayi': [0, 16],
                'Linnocua': [0, 3],
                'Listeria': [0, 19],
                'Listeriaceae': [0, 19]
            }

    Warning:
        Keep in mind, that this step - along with generation of xml_names in
        :func:`xml_name_parse` - groups files referring to the same dataset
        under a single name, and technically is vulnerable for some errors:
        If filenames given to these functions do not follow appropriate
        naming schema or when these filenames are not named in prefix-code
        convention, then some false-positives might be generated when
        grouping results and this will influence results.
    """
    # create a set of all taxa from tax_dict
    all_tax_set = set()
    for plik in tax_dict:
        for level in tax_dict[plik]:
            all_tax_set.add(level)

    xml_dict = {}
    for tax in all_tax_set:
        xml_dict[tax] = []
        for name in xml_names:
            for plik in tax_dict:
                # Following if might generate false positives,
                # but it is partially desired. If there is a lot of
                # lanes, reads, etc for given dataset then everything
                # will be grouped under a single "name", thanks
                # to how the names are generated in :func:`xml_name_parse`
                # function, and thanks to this line:
                if name in plik:
                    if tax in tax_dict[plik]:
                        xml_dict[tax].append(tax_dict[plik][tax])
                    else:
                        xml_dict[tax].append(0)
    return xml_dict


[docs]def prettify(elem):
    """Creates a string representation of an XML element by calling
    xml.etree.ElementTree.tostring() and then, parses this string again
    to obtain pretty-printed XML string, where indents are four spaces long.

    Args:
        elem: instance of class Element from xml.etree.ElementTree

    Returns:
        A string containing pretty-printed XML of <elem>
    """
    import xml.etree.ElementTree as ET
    from xml.dom import minidom
    rough_string = ET.tostring(elem, 'utf-8')
    reparsed = minidom.parseString(rough_string)
    return reparsed.toprettyxml(indent="    ")


[docs]def deunique(node):
    """Extract name of a node from string representing taxonomic position
    of this node.

    Example:
        input::

            'lvl1_____lvl2_____lvl_3_____lvln'

        output::

            'lvln'

    Args:
        node:
            a string with a list of ancestors of the node
            and the node itself, separated by '_____'.

    Returns:
        A string with pure name of the node.

    See Also:
        :func:`linia_unique` function
    """
    clean_node = node.split('_____')[-1]
    return clean_node


[docs]def xml_prepare(xml_names, xml_dict, tax_tree, name_total_count, unit='reads'):
    """Generates XML string, dedicated to use with Krona.
    XML is generated with use of default python xml module.

    Args:

        xml_names: a list of readable identifiers of datasets.

            Example::

                ['filename_1', 'filename_2']

        xml_dict: a dict with lists of occurrences of nodes by node.
            Keys are names of nodes. Values are lists of constant length,
            where every item on position *X* means: number of occurrences
            of node in dataset on position *X* on list xml_names.

            Note, that for all i: len(xml_dict[i]) is equal to len(xml_names).

            Example::

                {node_1: [count_1, count_2, ... count_x],
                node_2: [count_1, count_2, ... count_x]}

        tax_tree:
            a dict of dicts, etc. In form of nested dicts,
            represents phylogenetic tree.
            Example::

                {a:{aa:{}, ab:{aba:{}, abb:{abba:{}}}}}

        name_total_count:
            a dict with numbers of occurrences of nodes by dataset.
            Keys are names of dataset from xml_names, values are counts.
            Example::

                {identifier_1: count_1, identifier_2: count_2}

        unit: A string. Defines units to be used in the Krona.


    Returns:
        A single string with generated XML.

    HARDCODED:
        Number of levels to iterate in tax_tree is hardcoded to 9.
        As original comment states:
        'however it ignores all absent levels! which is actually kind of cool'
    """
    # Create a xml
    import xml.etree.ElementTree as ET
    krona = ET.Element('krona')

    # Set units
    attributes = ET.SubElement(krona, 'attributes', magnitude=unit)
    attribute = ET.SubElement(attributes, 'attribute', display=unit)
    attribute.text = unit

    # Put dataset names into the xml
    datasets = ET.SubElement(krona, 'datasets')
    for name in xml_names:
        dataset = ET.SubElement(datasets, 'dataset')
        dataset.text = name

    # Put counts adequate to corresponding names into the xml
    node = ET.SubElement(krona, 'node', name='%s count'%(unit))
    reads = ET.SubElement(node, unit)
    for name in xml_names:
        val = ET.SubElement(reads, 'val')
        val.text = str(name_total_count[name])

    # Translate tex_tree into xml
    for lvl_1 in tax_tree:
        node1 = ET.SubElement(node, 'node', name=deunique(lvl_1))
        reads1 = ET.SubElement(node1, unit)
        for element in xml_dict[lvl_1]:
            val = ET.SubElement(reads1, 'val')
            val.text = str(element)
        tmp_2_d = tax_tree[lvl_1]
        for lvl_2 in tmp_2_d:
            node2 = ET.SubElement(node1, 'node', name=deunique(lvl_2))
            reads2 = ET.SubElement(node2, unit)
            for element in xml_dict[lvl_2]:
                val = ET.SubElement(reads2, 'val')
                val.text = str(element)
            tmp_3_d = tmp_2_d[lvl_2]
            for lvl_3 in tmp_3_d:
                node3 = ET.SubElement(node2, 'node', name=deunique(lvl_3))
                reads3 = ET.SubElement(node3, unit)
                for element in xml_dict[lvl_3]:
                    val = ET.SubElement(reads3, 'val')
                    val.text = str(element)
                tmp_4_d = tmp_3_d[lvl_3]
                for lvl_4 in tmp_4_d:
                    node4 = ET.SubElement(node3, 'node', name=deunique(lvl_4))
                    reads4 = ET.SubElement(node4, unit)
                    for element in xml_dict[lvl_4]:
                        val = ET.SubElement(reads4, 'val')
                        val.text = str(element)
                    tmp_5_d = tmp_4_d[lvl_4]
                    for lvl_5 in tmp_5_d:
                        node5 = ET.SubElement(node4, 'node', name=deunique(lvl_5))
                        reads5 = ET.SubElement(node5, unit)
                        for element in xml_dict[lvl_5]:
                            val = ET.SubElement(reads5, 'val')
                            val.text = str(element)
                        tmp_6_d = tmp_5_d[lvl_5]
                        for lvl_6 in tmp_6_d:
                            node6 = ET.SubElement(node5, 'node', name=deunique(lvl_6))
                            reads6 = ET.SubElement(node6, unit)
                            for element in xml_dict[lvl_6]:
                                val = ET.SubElement(reads6, 'val')
                                val.text = str(element)
                            tmp_7_d = tmp_6_d[lvl_6]
                            for lvl_7 in tmp_7_d:
                                node7 = ET.SubElement(node6, 'node', name=deunique(lvl_7))
                                reads7 = ET.SubElement(node7, unit)
                                for element in xml_dict[lvl_7]:
                                    val = ET.SubElement(reads7, 'val')
                                    val.text = str(element)
                                tmp_8_d = tmp_7_d[lvl_7]
                                for lvl_8 in tmp_8_d:
                                    node8 = ET.SubElement(node7, 'node', name=deunique(lvl_8))
                                    reads8 = ET.SubElement(node8, unit)
                                    for element in xml_dict[lvl_8]:
                                        val = ET.SubElement(reads8, 'val')
                                        val.text = str(element)
                                    tmp_9_d = tmp_8_d[lvl_8]
                                    for lvl_9 in tmp_9_d:
                                        node9 = ET.SubElement(node8, 'node', name=deunique(lvl_9))
                                        reads9 = ET.SubElement(node9, unit)
                                        for element in xml_dict[lvl_9]:
                                            val = ET.SubElement(reads9, 'val')
                                            val.text = str(element)

    # Push created xml structure to beautician and return resulting string
    return prettify(krona)


[docs]def name_total_reduction(xml_names, file_total_count):
    """Rewrites given <file_total_count> dict, to group total counts
    by names (datasets), instant of grouping by filenames.

    Keep in mind, that this step - along with generation of xml_names
    in xml_name_parse - groups files referring to the same dataset under
    single name, and technically is vulnerable for some errors.

    See also:
        description of :func:`xml_vals` function

    Args:
        xml_names: readable identifiers of groups of files referring to
            common dataset; derived from filenames.
            Example::

                ['filename_1', 'filename_2']

        file_total_count: A dict within information about files are kept
            and values of every leaf are summed into values (usually
            representing total count of species inside particular file)

            Example::

                {
                    'filename_1.ext': 15,
                    'filename_2.ext': 33,
                    'filename_2_part_2.ext': 33
                }

    Returns:
        A dict, where keys are names as present in xml_names and values
        are summed total counts (usually of occurrences of different species).

        Example::

            {'filename_1': 15, 'filename_2': 66}
    """
    name_total_count = {}
    for name in xml_names:
        count = 0
        for plik in file_total_count:
            if name in plik:
                count += file_total_count[plik]
            name_total_count[name] = count
    return name_total_count


[docs]def xml_format(full_dict, tax_dict):
    """Creates a set of dicts, which are useful to generate xml files from.

    Args:
        full_dict:
            a dict of dicts with structure like::

                {
                    'output_type_1':
                    {
                        'filename_1.ext':
                        {
                            'Bacillaceae': {'Anoxybacillus': {'subsum': 15}}
                        },
                        'filename_2.ext':
                        {
                            'Listeriaceae':
                            {
                                'Listeria':
                                {
                                    'Lgrayi': {'subsum': 22},
                                    'Linnocua': {'subsum': 11}
                                }
                            }
                        }
                    },
                    'output_type_2':
                    {
                        'filename_2.ext':
                        {
                            'Bacillaceae': {'Anoxybacillus': {'subsum': 15}}
                        }
                    }
                }

            "output type" typically will be 'ITS', '16S'.

        tax_dict:
            A dict, where:
                - keys are file identifiers,
                - values are dicts, where:
                    - keys are names of taxa,
                    - values are numbers of occurrences of particular taxon.

            Example::

                {
                    'filename_1.ext':
                    {
                        'Bacillaceae': 5, 'Anoxybacillus': 5
                    },
                    'filename_2.ext':
                    {
                        'Bacillaceae': 1, 'Anoxybacillus': 1,
                        'Listeriaceae': 19, 'Listeria': 19,
                        'Lgrayi': 16, 'Linnocua': 3
                    }
                }


    Returns:
        A tuple: (xml_names, xml_dict, tax_tree, name_total_count)

        xml_names:
            readable identifiers of groups of files referring to common dataset;
            derived from filenames.

            Example::

                ['filename_1', 'filename_2']

        xml_dict:
            a dict, where:
                - keys are names of taxa,
                - values are lists with counters of occurences of particular
                  taxon in subsequent groups of files. Order on this list
                  is defined by order of names in xml_names list.

            Example::

                {
                    'Anoxybacillus': [5, 1],
                    'Bacillaceae': [5, 1],
                    'Lgrayi': [0, 16],
                    'Linnocua': [0, 3],
                    'Listeria': [0, 19],
                    'Listeriaceae': [0, 19]
                }

        tax_tree:
            a dict without information about type and file.

            Example::

                {
                    'Bacillaceae':
                    {
                        'Anoxybacillus': {}
                    },
                    'Listeriaceae':
                    {
                        'Listeria':
                        {
                            'Lgrayi': {}, 'Linnocua': {}
                        }
                    }
                }


        name_total_count:
            a dict, where keys are names as present in
            xml_names and values are summed total counts (usually of
            occurrences of different species).

            Example::

                {'filename_1': 5, 'filename_2': 20}
    """
    # Rewrites the data from a dict containing information arranged by
    # type and then by file into two dicts:
    #   - first represents full taxonomic tree
    #   - second presents how many species are inside particular files
    tax_tree, file_total_count = tree_of_life(full_dict)

    # Creates a list of simplified filenames (full filenames comes
    # from <full_dic>). The list is guaranteed to not have duplicates.
    xml_names = xml_name_parse(full_dict)

    # Reformats information extracted from tax_dict into another dict,
    # to allow use of this data in xml-generation process.
    xml_dict = xml_vals(xml_names, tax_dict)

    # Rewrites given <file_total_count> dict, to group total counts
    # by names (datasets), instant of grouping by filenames.
    name_total_count = name_total_reduction(xml_names, file_total_count)

    return xml_names, xml_dict, tax_tree, name_total_count


[docs]def out_namespace(curr, out_types):
    """Generates paths to places, where krona xml and krona html files
    are or will be placed. Path composition varies, according to given
    arguments and always contains some string representation of `out_type`.
    Paths may start in current working directory or in directory specified by `curr`.

    If 'txt' is one of output_types, the basename of output files will be
    'humann-graphlan'; otherwise, the basename will be concatenation of
    elements from 'out_types' list, with underscore '_' as glue.

    In case of multiple elements on <out_types> list, they will be sorted, so
    the output files will have consistent names regardless to parameters' order.

    Examples of generated filenames, basing on <out_types>:

        - for ['ITS']:          'ITS.krona', 'ITS.html'
        - for ['ITS', '16S']:   '16S_ITS.krona', '16S_ITS.html'
        - for ['txt']:          'humann-graphlan.krona', 'humann-graphlan.html'


    Args:
        curr: A string - path to directory where the files are/will be placed.
            If curr is 'in_situ', the path will start in current working directory.

        out_type: A list that determines the basename of output files.
            In practice, we expect list of elements: 'ITS', '16S' or 'txt'.

    Returns:
        A tuple (out_xml, out_html):

        out_xml: a string with path to file.
            The filename is derived from (out_type) and file has '.krona' extension.

        out_html: a string with path to file.
            The filename is derived from (out_type) and file has '.html' extension.
    """
    if curr == 'in_situ':
        curr = getcwd()

    if 'txt' in out_types:
        out_xml = pjoin(curr, 'humann-graphlan.krona')
    else:
        out_xml = pjoin(curr, '_'.join(sorted(out_types)) + '.krona')

    out_html = out_xml.replace('krona', 'html')

    return out_xml, out_html


[docs]def txt_dict_clean(dicto):
    """Cleans given file structure by removing files
    which do not contain words "graplan" or "tree".
    Emptied directories are also removed.

    Args:
        dicto:
            A dict where:
                - keys are names of directories,
                - values are lists with filenames, which are in the directory.

            Example::

                {'dir_name_1': ['file_1','file_2'], 'dir_name_2': ['file_1']}

            Explicit example in description of output of :func:`cat_read`.

    Returns:

        A dict in the same format, as given one (without unwanted files)
    """
    to_remove = set()
    for directory in dicto:
        for plik in dicto[directory]:
            if ('graphlan' not in plik) or ('tree' not in plik):
                to_remove.add((directory, plik))
    for pair in to_remove:
        dicto[pair[0]].remove(pair[1])
    dir_to_del = []
    for directory in dicto:
        if len(dicto[directory]) == 0:
            dir_to_del.append(directory)
    for directory in dir_to_del:
        del dicto[directory]
    return dicto


[docs]def xml_names_graphlan(input_d):
    """Extracts values from given dict, puts the values on a list, and then
    removes from every value those parts that are prefixes and suffixes
    common for all items on the list.

    Args:
        input_d: A dict where:
            keys are names of directories,
            values are lists with filenames, which are in the directory.

            Example:

                {'dir_name_1': ['file_1','file_2'], 'dir_name_2': ['file_1']}

            Explicit example in description of output of :func:`cat_read`.

    Returns:
        A list with all filenames which were given on input,
        trimmed by common prefixes and suffixes.
    """
    from os.path import commonprefix as CP

    # create a set (and then a list) with all names of files given on input
    allnames = set()
    for path in input_d:
        files = set(input_d[path])
        allnames = allnames | files
    allnames = list(allnames)

    comm_pref = CP(allnames)

    allrev = []
    for plik in allnames:
        allrev.append(plik[::-1])

    comm_suff = CP(allrev)[::-1]

    # remove common prefix and suffix from every filename
    for idx in xrange(len(allnames)):
        file_name = allnames[idx]
        file_name = file_name[len(comm_pref):]
        file_name = file_name[:-len(comm_suff)]
        allnames[idx] = file_name

    return allnames


[docs]def tax_tree_extend(tax_tree, linia):
    """Recursively adds elements form list `linia` into the tree `tax_tree`.

    Args:
        tax_tree: tree represented as dict of dicts.
        linia: a list in order:
            from the oldest ancestor to the youngest descendant.
            Elements should be formatted to contain information
            about ancestors (like it does :func:`linia_unique` function).

    Returns:
        Extended tree in form of dict of dicts.
    """
    if linia[0] not in tax_tree:
        tax_tree[linia[0]] = {}
    if len(linia) > 1:
        tax_tree[linia[0]] = tax_tree_extend(tax_tree[linia[0]], linia[1:])
    return tax_tree


[docs]def linia_unique(linia):
    """Creates list where every element includes information about previous
    elements, in order, separated by "____".

    Example:
        input:

            ['1','2','3'],

        output:

            ['1', '1_____2', '1_____2_____3']

    Args:
        linia: a list.

    Returns:
        A list.

    See Also:
        :func:`deunique` function
   """
    line = linia[:]
    for idx in xrange(len(linia)):
        line[idx] = '_____'.join(linia[:idx+1])
    return line


[docs]def tax_tree_graphlan(input_d):
    """Reads and interprets information about taxonomic relations,
    from files in Graphlan-like format.

    Args:
        input_d: A dict where
            keys are names of directories,
            values are lists with filenames, which are in the directory.

            Example::

                {'dir_name_1': ['file_1','file_2'], 'dir_name_2': ['file_1']}

            Explicit example in description of output of :func:`cat_read`.

            Example of featured file::

                Listeriaceae.Listeria.Lgrayi
                Listeriaceae.Listeria.Linnocua

    Returns:
        A tuple (total_tax_tree, per_file_tax_tree, multi_flat_tax_tree):

        total_tax_tree:
            A dict of dicts, etc. In form of nested dicts, represents
            phylogenetic trees of entities described by files featured in the input.
            This is a total tree - nodes from all files are merged here.

            Example::

                {
                    'Bacillaceae':
                    {
                        'Bacillaceae_____Anoxybacillus': {}
                    },
                    'Listeriaceae':
                    {
                        'Listeriaceae_____Listeria':
                        {
                            'Listeriaceae_____Listeria_____Lgrayi': {},
                            'Listeriaceae_____Listeria_____Linnocua': {},
                        }
                    }
                }

        per_file_tax_tree:
            A dict of dicts, etc. Keys are filenames.
            In form of nested dicts, represents phylogenetic trees of entities,
            described by files featured in the input.

            Example::

                {
                    'annot_1.txt':
                    {
                        'Bacillaceae':
                        {
                            'Bacillaceae_____Anoxybacillus': {}
                        }
                    }
                    'annot_2.txt':
                    {
                        'Listeriaceae':
                        {
                            'Listeriaceae_____Listeria':
                            {
                                'Listeriaceae_____Listeria_____Lgrayi': {},
                                'Listeriaceae_____Listeria_____Linnocua': {},
                            }
                        }
                    }
                }

        multi_flat_tax_tree: A dict of dicts, where:
                - keys are filenames,
                - values are dicts, where:

                    - keys are names of nodes,
                    - values are numbers of nodes,
                      with names starting from "node_name"

            Example::

                {
                    'annot_1.txt':
                    {
                        'Bacillaceae': 8,
                        'Bacillaceae_____Anoxybacillus': 8
                    },
                    'annot_2.txt':
                    {
                        'Listeriaceae': 16,
                        'Listeriaceae_____Listeria': 16,
                        'Listeriaceae_____Listeria_____Lgrayi': 8,
                        'Listeriaceae_____Listeria_____Linnocua': 8
                    }
                }
    """
    total_tax_tree = {}
    per_file_tax_tree = {}
    multi_flat_tax_tree = {}
    for path in input_d:
        for plik in input_d[path]:
            multi_flat_tax_tree[plik] = {}
            kos = open(pjoin(path, plik), 'r').readlines()
            file_tax_tree = {}
            for line in kos:
                line = line.split('\t')[0]
                line = line.rstrip()
                linia = line.split('.')
                linia = linia_unique(linia)
                file_tax_tree = tax_tree_extend(file_tax_tree, linia)
                for node in linia:
                    try:
                        multi_flat_tax_tree[plik][node] += 1
                    except KeyError:
                        multi_flat_tax_tree[plik][node] = 1
            total_tax_tree.update(file_tax_tree)
            per_file_tax_tree[plik] = file_tax_tree
    return total_tax_tree, per_file_tax_tree, multi_flat_tax_tree


[docs]def xml_counts_graphlan(per_file_tax_tree, xml_names, multi_flat_tax_tree):
    """Groups and/or sums numbers of nodes (obtained from multi_flat_tax_tree):
        1. by node_name - grouping
        2. by identifier (obtained from filename) - summing

    Args:
        per_file_tax_tree: A dict of dicts, etc. Top-level keys are filenames.
            Taxonomic tree represented by nested dicts. Similar to tax_tree.
            Example included in description of :func:`tax_tree_graphlan`.

        xml_names: readable identifiers of files derived from filenames.
            Example::

                ['filename_1', 'filename_2']

        multi_flat_tax_tree: A dict of dicts, where:
                - keys are filenames,
                - values are dicts, where:
                    - keys are names of nodes,
                    - values are numbers of nodes with names
                      starting from "node_name"

            Explicit example in description of :func:`tax_tree_graphlan`.

    Returns:
        A tuple (xml_dict, name_total_count):
        xml_dict: A dict. Contains lists of numbers of occurrences of nodes by node.
                - keys are names of nodes.
                - values are lists of constant length, where every item on
                  position *X* means: number of occurrences of nodes with prefix
                  equal to *name of node* in file *X*.  *file X* is that file,
                  which is on position *X* on list xml_names.

            Example::

                {node: [count_1, count_2], node_2: [count_1, count_2]}

        name_total_count: A dict. Contains summed numbers of occurrences of nodes by file.
            - keys are identifiers (derived from filenames - look for xml_names),
            - values ale sums.

            Example::

                {identifier_1: count_1, identifier_2: count_1}
    """
    all_nodes = set()
    xml_dict = {}
    name_total_count = {}
    for plik in multi_flat_tax_tree:
        all_nodes = all_nodes | set(multi_flat_tax_tree[plik].keys())
    for node in all_nodes:
        xml_dict[node] = []
        for name in xml_names:
            for plik in multi_flat_tax_tree:
                if name in plik:
                    if node in multi_flat_tax_tree[plik]:
                        xml_dict[node].append(multi_flat_tax_tree[plik][node])
                    else:
                        xml_dict[node].append(0)
    for name in xml_names:
        for plik in multi_flat_tax_tree:
            if name in plik:
                name_total_count[name] = 0
                for node in per_file_tax_tree[plik]:
                    name_total_count[name] += multi_flat_tax_tree[plik][node]
    return xml_dict, name_total_count


[docs]def graphlan_to_krona(input_d):
    """Modifies files given by `inupt_d` from Graphlan format to set of
    xml strings, which may be assembled into a input for Krona.

    Args:
        input_d: A dict where:
                - keys are names of directories,
                - values are lists with filenames, which are in the directory.

            Example::

                {'dir_name_1': ['file_1','file_2'], 'dir_name_2': ['file_1']}

            Explicit example in description of output of :func:`cat_read`.

    Returns:
        A tuple (xml_names, xml_dict, tax_tree, name_total_count):

        xml_names: readable identifiers of files derived from filenames.
            Filenames comes from input_d dict.
            Example::

                [identifier_1, identifier_2, ..., identifier_x]

        xml_dict: Contains lists of numbers of occurrences of nodes, grouped by node:
                - keys are names of nodes,
                - values are lists of constant length, where every item on
                  position *X* means: number of occurrences of nodes with
                  prefix equal to *name of node* in file *X*. *file X* is that
                  file, which is on position *X* on list xml_names.

            Note, that for all i: len(xml_dict[i]) is equal to len(xml_names).

            Example::

                {node: [count_1, count_2, ... count_x], node_2: [count_1, count_2, ... count_x]}

        tax_tree: A dict of dicts, etc.
            In form of nested dicts, it represents taxonomic tree.
            Example::

                {a:{aa:{}, ab:{aba:{}, abb:{abba:{}}}}}

            Explicit example in description of `total_tax_tree` in
            :func:`tax_tree_graphlan`,

        name_total_count: A dict.
            Contains summed numbers of occurrences of nodes by file.

                - keys are identifiers (derived from filenames - check
                  ``xml_names`` description),
                - values are sums.

            Example::

                {identifier_1: count_1, identifier_2: count_2}
    """
    input_d = txt_dict_clean(input_d)
    xml_names = xml_names_graphlan(input_d)
    tax_tree, per_file_tax_tree, multi_flat_tax_tree = tax_tree_graphlan(input_d)
    xml_dict, name_total_count = xml_counts_graphlan(per_file_tax_tree, xml_names, multi_flat_tax_tree)
    return xml_names, xml_dict, tax_tree, name_total_count


[docs]def create_krona_xml(xml_string, out_xml):
    """Writes xml_string into the file of name out_xml.
    If the file does not exist, creates a new file.

    Args:
        xml_string: a string with content to write

        out_xml: a string with path to file
    """
    krona_write = open(out_xml, 'w')
    krona_write.write(xml_string)
    krona_write.close()


[docs]def create_krona_html(krona_html_name, krona_xml_name):
    """Runs script which creates Krona html file from krona xml file"""
    command = 'ktImportXML -o %s %s' % (krona_html_name, krona_xml_name)
    system(command)


[docs]def prepare_taxonomy_dicts(opts, input_dict):
    """ Prepares dicts with taxonomy stats.

    See Also:
        :func:`prepare_taxonomy_stats` for more information.
    """
    taxonomic_dbs = {'ITS': opts.db_taxonomy_ITS,
                     '16S': opts.db_taxonomy_16S}

    analysed_dict = {}
    pure_tax = {}

    for out_type in opts.output_type:
        # Extracts from specially formatted FASTA file taxonomic data
        # and returns them as hierarchically organised dict
        SSU = SSU_read(taxonomic_dbs[out_type], out_type)

        # Runs files analysis on data which were interpreted by SSU_read
        # and then, creates dicts with summarized taxonomic data
        all_dicts, all_tax_dict = dict_prepare(out_type, input_dict[out_type], SSU)
        analysed_dict[out_type] = all_dicts
        pure_tax.update(all_tax_dict)

    return analysed_dict, pure_tax


[docs]def prepare_taxonomy_stats(opts):
    """Performs statistical analysis of taxonomy from appropriate files
    from current working directory: counts occurrences of different taxa
    and prepares the results to be presented in HTML format.
    Results will be converted to HTML (with the Krona program),
    but only when `opts.mode` is set to 'run'.

    Args:
        opts: A namespace, where:

            opts.output_type:
                Allows to choice on which files the analysis will be
                performed and also determines basenames of output files.

                One of: ['ITS', '16S', 'txt']

            opts.mode:
                A mode in which the program will be run.

                One of: ['test', 'run']

            opts.out_dir:
                Indicates, where the output files should be located.
                To specify current working directory, use 'in_situ'.

                One of: ['in_situ', a_string_with_path_to_dir]

            opts.db_taxonomy_16S:
                Path to 16S database used in taxonomy classification
                (fasta with specially formatted headers)

            opts.db_taxonomy_ITS:
                Path to ITS database used in taxonomy classification
                (fasta with specially formatted headers)

    Input:

        Analysis will be performed on files, meeting all the following conditions:
            - files are located in current working directory
              or in subdirectories of current working directory,
            - suffix of filename is equal to `opts.output_type`,
            - if `opts.output_type` is 'ITS' or '16S', filenames
              contains 'usearch\_' before the `opts.output_type` in name.

    Output:

        Output files will be placed in `opts.out_dir` directory, with
        basenames depending on `opts.output_type`.

        Examples of filenames:
            - for 'ITS':                ('ITS.krona', 'ITS.html')
            - for ['ITS', '16S']:       ('ITS_16S.krona', 'ITS_16S.html')

    """
    # Generates list of locations were input files are located.
    input_dict = input_locations(opts.mode, opts.output_type)

    if 'txt' in opts.output_type:
        # no options alongside 'txt' allowed
        assert len(opts.output_type) == 1

        input_dict_for_txt = input_dict['txt']
        xml_names, xml_dict, tax_tree, name_total_count = graphlan_to_krona(input_dict_for_txt)
        units = 'processes'
    else:
        # only '16S' and 'ITS' allowed (if not 'txt')
        assert len(set(opts.output_type) - {'16S', 'ITS'}) == 0

        analysed_dict, pure_tax = prepare_taxonomy_dicts(opts, input_dict)
        xml_names, xml_dict, tax_tree, name_total_count = xml_format(analysed_dict, pure_tax)
        units = 'reads'

    xml_string = xml_prepare(xml_names, xml_dict, tax_tree, name_total_count, units)

    xml_name, html_name = out_namespace(opts.out_dir, opts.output_type)
    chdir('../')
    # Writes xml_string into the file given by out_namespace
    create_krona_xml(xml_string, xml_name)
    if opts.mode != 'run':
        return
    create_krona_html(html_name, xml_name)