seqsearch.search.vsearch

Written by Lucas Sinclair. MIT Licensed. Contact at www.sinclair.bio

  1#!/usr/bin/env python3
  2# -*- coding: utf-8 -*-
  3
  4"""
  5Written by Lucas Sinclair.
  6MIT Licensed.
  7Contact at www.sinclair.bio
  8"""
  9
 10# Built-in modules #
 11
 12# Internal modules #
 13from autopaths.file_path import FilePath
 14from seqsearch.search.core import CoreSearch
 15
 16# Third party modules #
 17from seqsearch import sh
 18
 19###############################################################################
 20class VSEARCHquery(CoreSearch):
 21    """
 22    A vsearch job.
 23
 24    Example commands:
 25
 26        vsearch --usearch_global queries.fas --db queries.fas --id 0.6
 27                --blast6out results.blast6 --maxaccepts 0 --maxrejects 0
 28
 29        ./vsearch --usearch_global queries.fsa --db database.fsa --id 0.9
 30                  --alnout alnout.txt
 31
 32    Some interesting options form the manual follow:
 33
 34    --maxaccepts <positive integer>
 35    > Maximum number of hits to accept before stopping the search. The default
 36    value is 1. This option works in pair with --maxrejects. The search process
 37    sorts target sequences by decreasing number of k-mers they have in common
 38    with the query sequence, using that information as a proxy for sequence
 39    similarity. After pairwise alignments, if the first target sequence passes
 40    the acceptation criteria, it is accepted as best hit and the search process
 41    stops for that query. If --maxaccepts is set to a higher value, more hits
 42    are accepted.
 43    """
 44
 45    extension = 'vsearchout'
 46
 47    @property
 48    def command(self):
 49        # Executable #
 50        if self.executable: cmd = [self.executable.path]
 51        else:               cmd = ['vsearch']
 52        # Other parameters
 53        cmd += ['--usearch_global', self.query,
 54                '--db',             self.db,
 55                '--blast6out',      self.out_path,
 56                '--threads',        self.cpus]
 57        # Options #
 58        for k,v in self.params.items(): cmd += [k, v]
 59        # Return #
 60        return list(map(str, cmd))
 61
 62    #-------------------------------- RUNNING --------------------------------#
 63    def run(self, verbose=False):
 64        """Simply run the VSEARCH search locally."""
 65        # Check the executable is available #
 66        if self.executable:
 67            self.executable.must_exist()
 68        else:
 69            from plumbing.check_cmd_found import check_cmd
 70            check_cmd('vsearch')
 71        # Create the output directory if it doesn't exist #
 72        self.out_path.directory.create_if_not_exists()
 73        # Optionally print the command #
 74        if verbose:
 75            print("Running VSEARCH command:\n    %s" % ' '.join(self.command))
 76        # Run it #
 77        cmd    = sh.Command(self.command[0])
 78        result = cmd(self.command[1:], _out=self._out, _err=self._err)
 79        # Return #
 80        return result
 81
 82    #----------------------------- PARSE RESULTS -----------------------------#
 83    @property
 84    def results(self):
 85        """
 86        Parse the results and yield biopython SearchIO entries.
 87
 88        Beware:
 89        Some databases are not unique on the id, and this causes the parser to
 90        complain about duplicate entries and raise exceptions such as:
 91
 92            ValueError: The ID or alternative IDs of Hit 'DQ448783' exists
 93            in this QueryResult.
 94
 95        Summary of the columns:
 96        https://www.metagenomics.wiki/tools/blast/blastn-output-format-6
 97
 98            qseqid sseqid pident length mismatch gapopen qstart qend sstart send
 99            evalue bitscore
100
101        Warning: Unlike BLAST results, if a sequence got no hits it is NOT
102                 reported at all in VSEARCH. The number of entries yielded
103                 will not match the number of sequences at input.
104        """
105        # Import parsing library #
106        from Bio import SearchIO
107        # Avoid the warning #
108        import warnings
109        warnings.filterwarnings("ignore", 'BiopythonDeprecationWarning')
110        # Iterate over lines #
111        with open(self.out_path, 'rt') as handle:
112            for entry in SearchIO.parse(handle, 'blast-tab'):
113                yield entry
114
115###############################################################################
116class VSEARCHdb(FilePath):
117    """A VSEARCH database one can search against."""
118
119    def __repr__(self):
120        return '<%s on "%s">' % (self.__class__.__name__, self.path)
121
122    def __init__(self, path):
123        # Call parent constructor #
124        FilePath.__init__(self, path)
125        # Check if the corresponding FASTA exists #
126        self.fasta_path = self.replace_extension('fasta')
127
128    def __bool__(self):
129        """Does the indexed database actually exist?"""
130        return bool(self.replace_extension('udb'))
131
132    def create_if_not_exists(self, *args, **kwargs):
133        """If the indexed database has not been generated, generate it."""
134        if not self: return self.makedb(*args, **kwargs)
135
136    def makedb(self, output=None, stdout=None, verbose=False):
137        # We need a corresponding fasta that exists #
138        if not self.fasta_path:
139            raise Exception("No fasta file at '%s'." % self.fasta_path)
140        # Message #
141        if verbose:
142            msg = "Calling `makeudb_usearch` from '%s' to '%s'."
143            print(msg % (self.fasta_path, self))
144        # The output #
145        if output is None: output = self
146        # Options #
147        options = ['--makeudb_usearch', self.fasta_path, '--output', output]
148        # Call the program #
149        sh.vsearch(*options, _out=stdout)
150        # Return #
151        return output
class VSEARCHquery(seqsearch.search.core.CoreSearch):
 21class VSEARCHquery(CoreSearch):
 22    """
 23    A vsearch job.
 24
 25    Example commands:
 26
 27        vsearch --usearch_global queries.fas --db queries.fas --id 0.6
 28                --blast6out results.blast6 --maxaccepts 0 --maxrejects 0
 29
 30        ./vsearch --usearch_global queries.fsa --db database.fsa --id 0.9
 31                  --alnout alnout.txt
 32
 33    Some interesting options form the manual follow:
 34
 35    --maxaccepts <positive integer>
 36    > Maximum number of hits to accept before stopping the search. The default
 37    value is 1. This option works in pair with --maxrejects. The search process
 38    sorts target sequences by decreasing number of k-mers they have in common
 39    with the query sequence, using that information as a proxy for sequence
 40    similarity. After pairwise alignments, if the first target sequence passes
 41    the acceptation criteria, it is accepted as best hit and the search process
 42    stops for that query. If --maxaccepts is set to a higher value, more hits
 43    are accepted.
 44    """
 45
 46    extension = 'vsearchout'
 47
 48    @property
 49    def command(self):
 50        # Executable #
 51        if self.executable: cmd = [self.executable.path]
 52        else:               cmd = ['vsearch']
 53        # Other parameters
 54        cmd += ['--usearch_global', self.query,
 55                '--db',             self.db,
 56                '--blast6out',      self.out_path,
 57                '--threads',        self.cpus]
 58        # Options #
 59        for k,v in self.params.items(): cmd += [k, v]
 60        # Return #
 61        return list(map(str, cmd))
 62
 63    #-------------------------------- RUNNING --------------------------------#
 64    def run(self, verbose=False):
 65        """Simply run the VSEARCH search locally."""
 66        # Check the executable is available #
 67        if self.executable:
 68            self.executable.must_exist()
 69        else:
 70            from plumbing.check_cmd_found import check_cmd
 71            check_cmd('vsearch')
 72        # Create the output directory if it doesn't exist #
 73        self.out_path.directory.create_if_not_exists()
 74        # Optionally print the command #
 75        if verbose:
 76            print("Running VSEARCH command:\n    %s" % ' '.join(self.command))
 77        # Run it #
 78        cmd    = sh.Command(self.command[0])
 79        result = cmd(self.command[1:], _out=self._out, _err=self._err)
 80        # Return #
 81        return result
 82
 83    #----------------------------- PARSE RESULTS -----------------------------#
 84    @property
 85    def results(self):
 86        """
 87        Parse the results and yield biopython SearchIO entries.
 88
 89        Beware:
 90        Some databases are not unique on the id, and this causes the parser to
 91        complain about duplicate entries and raise exceptions such as:
 92
 93            ValueError: The ID or alternative IDs of Hit 'DQ448783' exists
 94            in this QueryResult.
 95
 96        Summary of the columns:
 97        https://www.metagenomics.wiki/tools/blast/blastn-output-format-6
 98
 99            qseqid sseqid pident length mismatch gapopen qstart qend sstart send
100            evalue bitscore
101
102        Warning: Unlike BLAST results, if a sequence got no hits it is NOT
103                 reported at all in VSEARCH. The number of entries yielded
104                 will not match the number of sequences at input.
105        """
106        # Import parsing library #
107        from Bio import SearchIO
108        # Avoid the warning #
109        import warnings
110        warnings.filterwarnings("ignore", 'BiopythonDeprecationWarning')
111        # Iterate over lines #
112        with open(self.out_path, 'rt') as handle:
113            for entry in SearchIO.parse(handle, 'blast-tab'):
114                yield entry

A vsearch job.

Example commands:

vsearch --usearch_global queries.fas --db queries.fas --id 0.6
        --blast6out results.blast6 --maxaccepts 0 --maxrejects 0

./vsearch --usearch_global queries.fsa --db database.fsa --id 0.9
          --alnout alnout.txt

Some interesting options form the manual follow:

--maxaccepts

Maximum number of hits to accept before stopping the search. The default value is 1. This option works in pair with --maxrejects. The search process sorts target sequences by decreasing number of k-mers they have in common with the query sequence, using that information as a proxy for sequence similarity. After pairwise alignments, if the first target sequence passes the acceptation criteria, it is accepted as best hit and the search process stops for that query. If --maxaccepts is set to a higher value, more hits are accepted.

def run(self, verbose=False):
64    def run(self, verbose=False):
65        """Simply run the VSEARCH search locally."""
66        # Check the executable is available #
67        if self.executable:
68            self.executable.must_exist()
69        else:
70            from plumbing.check_cmd_found import check_cmd
71            check_cmd('vsearch')
72        # Create the output directory if it doesn't exist #
73        self.out_path.directory.create_if_not_exists()
74        # Optionally print the command #
75        if verbose:
76            print("Running VSEARCH command:\n    %s" % ' '.join(self.command))
77        # Run it #
78        cmd    = sh.Command(self.command[0])
79        result = cmd(self.command[1:], _out=self._out, _err=self._err)
80        # Return #
81        return result

Simply run the VSEARCH search locally.

results

Parse the results and yield biopython SearchIO entries.

Beware: Some databases are not unique on the id, and this causes the parser to complain about duplicate entries and raise exceptions such as:

ValueError: The ID or alternative IDs of Hit 'DQ448783' exists
in this QueryResult.

Summary of the columns: https://www.metagenomics.wiki/tools/blast/blastn-output-format-6

qseqid sseqid pident length mismatch gapopen qstart qend sstart send
evalue bitscore

Warning: Unlike BLAST results, if a sequence got no hits it is NOT reported at all in VSEARCH. The number of entries yielded will not match the number of sequences at input.

class VSEARCHdb(autopaths.file_path.FilePath):
117class VSEARCHdb(FilePath):
118    """A VSEARCH database one can search against."""
119
120    def __repr__(self):
121        return '<%s on "%s">' % (self.__class__.__name__, self.path)
122
123    def __init__(self, path):
124        # Call parent constructor #
125        FilePath.__init__(self, path)
126        # Check if the corresponding FASTA exists #
127        self.fasta_path = self.replace_extension('fasta')
128
129    def __bool__(self):
130        """Does the indexed database actually exist?"""
131        return bool(self.replace_extension('udb'))
132
133    def create_if_not_exists(self, *args, **kwargs):
134        """If the indexed database has not been generated, generate it."""
135        if not self: return self.makedb(*args, **kwargs)
136
137    def makedb(self, output=None, stdout=None, verbose=False):
138        # We need a corresponding fasta that exists #
139        if not self.fasta_path:
140            raise Exception("No fasta file at '%s'." % self.fasta_path)
141        # Message #
142        if verbose:
143            msg = "Calling `makeudb_usearch` from '%s' to '%s'."
144            print(msg % (self.fasta_path, self))
145        # The output #
146        if output is None: output = self
147        # Options #
148        options = ['--makeudb_usearch', self.fasta_path, '--output', output]
149        # Call the program #
150        sh.vsearch(*options, _out=stdout)
151        # Return #
152        return output

A VSEARCH database one can search against.

VSEARCHdb(path, *args, **kwargs)
59    def __new__(cls, path, *args, **kwargs):
60        """A Path object is in fact a string."""
61        return str.__new__(cls, cls.clean_path(path))

A Path object is in fact a string.

def create_if_not_exists(self, *args, **kwargs):
133    def create_if_not_exists(self, *args, **kwargs):
134        """If the indexed database has not been generated, generate it."""
135        if not self: return self.makedb(*args, **kwargs)

If the indexed database has not been generated, generate it.

def makedb(self, output=None, stdout=None, verbose=False):
137    def makedb(self, output=None, stdout=None, verbose=False):
138        # We need a corresponding fasta that exists #
139        if not self.fasta_path:
140            raise Exception("No fasta file at '%s'." % self.fasta_path)
141        # Message #
142        if verbose:
143            msg = "Calling `makeudb_usearch` from '%s' to '%s'."
144            print(msg % (self.fasta_path, self))
145        # The output #
146        if output is None: output = self
147        # Options #
148        options = ['--makeudb_usearch', self.fasta_path, '--output', output]
149        # Call the program #
150        sh.vsearch(*options, _out=stdout)
151        # Return #
152        return output
Inherited Members
autopaths.file_path.FilePath
first
prefix_path
prefix
name
filename
directory
count_bytes
count
size
contents
contents_utf8
md5
might_be_binary
contains_binary
magic_number
lines
read
create
touch
open
add_str
close
write
writelines
remove
copy
execute
replace_extension
new_name_insert
make_directory
must_exist
head
tail
move_to
rename
gzip_to
gzip_internal
gzip_external
gzip_pigz
ungzip_to
ungzip_internal
ungzip_external
zip_to
unzip_to
untar_to
untargz_to
untargz_to_internal
untargz_to_external
append
prepend
remove_line
remove_first_line
replace_line
replace_word
sed_replace
autopaths.base_path.BasePath
clean_path
short_prefix
extension
escaped
absolute_path
physical_path
relative_path
rel_path_from
exists
permissions
mdate
mdate_iso
cdate
cdate_iso
unix_style
wsl_style
win_style
with_tilda
with_home
builtins.str
encode
replace
split
rsplit
join
capitalize
casefold
title
center
expandtabs
find
partition
index
ljust
lower
lstrip
rfind
rindex
rjust
rstrip
rpartition
splitlines
strip
swapcase
translate
upper
startswith
endswith
removeprefix
removesuffix
isascii
islower
isupper
istitle
isspace
isdecimal
isdigit
isnumeric
isalpha
isalnum
isidentifier
isprintable
zfill
format
format_map
maketrans