seqsearch.search

Written by Lucas Sinclair. MIT Licensed. Contact at www.sinclair.bio

  1#!/usr/bin/env python3
  2# -*- coding: utf-8 -*-
  3
  4"""
  5Written by Lucas Sinclair.
  6MIT Licensed.
  7Contact at www.sinclair.bio
  8"""
  9
 10# Built-in modules #
 11import multiprocessing
 12
 13# First party modules #
 14from seqsearch.search.blast   import BLASTquery, BLASTdb
 15from seqsearch.search.vsearch import VSEARCHquery
 16from seqsearch.search.hmmer   import HmmQuery
 17from plumbing.cache           import property_cached
 18from autopaths.file_path      import FilePath
 19
 20################################################################################
 21class SeqSearch(object):
 22    """
 23    A sequence similarity search.
 24    Is able to use different algorithms such as BLAST, VSEARCH, HMMER, BLAT etc.
 25    all through the same interface.
 26
 27    Input: - Series of sequences in a FASTA file.
 28           - A database to search against.
 29           - The type of the sequences ('prot' or 'nucl').
 30           - The type of algorithm to use. Currently BLAST, VSEARCH or HMMER.
 31           - Number of threads to use.
 32           - The desired output path.
 33           - An extra set of parameters to be given to the search command.
 34           - The filtering options:
 35              * BLAST supported:   - e-value
 36                                   - Maximum targets
 37                                   - Minimum identity (via manual output format)
 38                                   - Minimum query coverage (via manual output format)
 39              * VSEARCH supported: - Maximum targets
 40              * HMMER supported:   - e-value
 41
 42    Output: - List of identifiers in the database
 43              (object with significance value and identity attached)
 44
 45    Other possible algorithms:
 46
 47        * https://www.ncbi.nlm.nih.gov/pubmed/11932250
 48        * https://www.animalgenome.org/bioinfo/resources/manuals/wu-blast/
 49        * https://github.com/bbuchfink/diamond
 50    """
 51
 52    def __repr__(self):
 53        return '<%s object on %s>' % (self.__class__.__name__, self.input_fasta)
 54
 55    def __bool__(self):
 56        return bool(self.query)
 57
 58    def __init__(self,
 59                 input_fasta,
 60                 database,
 61                 seq_type    = 'prot' or 'nucl',      # What sequence type is the input fasta
 62                 algorithm   = 'blast' or 'vsearch',  # Which implementation do you want
 63                 num_threads = None,                  # How many processes to use
 64                 filtering   = None,                  # The result filtering options
 65                 out_path    = None,                  # Where the out file will be
 66                 params      = None,                  # Add extra params for the command line
 67                 _out        = None,                  # Store the stdout at this path
 68                 _err        = None):                 # Store the stderr at this path
 69        # Save attributes #
 70        self.input_fasta = input_fasta
 71        self.database    = database
 72        self.seq_type    = seq_type
 73        self.algorithm   = algorithm
 74        self.num_threads = num_threads
 75        self.filtering   = filtering
 76        self.out_path    = out_path
 77        self.params      = params
 78        self._out        = _out
 79        self._err        = _err
 80        # Assign default values #
 81        self.set_defaults()
 82        # Validate attributes #
 83        self.validate()
 84
 85    def set_defaults(self):
 86        """
 87        This method will replace empty attributes with defaults when this is
 88        needed.
 89        """
 90        # In case we got a special object, just use the blast_db attribute #
 91        if self.algorithm == 'blast' and hasattr(self.database, 'blast_db'):
 92            self.database = self.database.blast_db
 93        if self.algorithm == 'vsearch' and hasattr(self.database, 'vsearch_db'):
 94            self.database = self.database.vsearch_db
 95        # Otherwise in case we got a path, convert it to a BLASTdb #
 96        if self.algorithm == 'blast' and not isinstance(self.database, BLASTdb):
 97            self.database = BLASTdb(self.database)
 98        # The filtering options #
 99        if self.filtering is None: self.filtering = {}
100        # Output path default value #
101        if self.out_path is None:
102            self.out_path = self.input_fasta.prefix_path + '.' + \
103                            self.algorithm + 'out'
104        # Output path setting #
105        self.out_path = FilePath(self.out_path)
106        # Number of cores default value #
107        if self.num_threads is None or self.num_threads is True:
108            self.num_threads = min(multiprocessing.cpu_count(), 32)
109        # Extra params to be given to the search algorithm #
110        if self.params is None: self.params = {}
111
112    def validate(self):
113        """
114        This method will raise an Exception is any of the arguments passed by
115        the user are illegal.
116        """
117        # The FASTA file has to contain something #
118        assert self.input_fasta
119
120    @property
121    def query(self):
122        """The actual search object with all the relevant parameters."""
123        # Pick the right attribute #
124        if self.algorithm == 'blast':   return self.blast_query
125        if self.algorithm == 'vsearch': return self.vsearch_query
126        if self.algorithm == 'hmmer':   return self.hmmer_query
127        # Otherwise raise an exception #
128        msg = "The algorithm '%s' is not supported."
129        raise NotImplemented(msg % self.algorithm)
130
131    #------------------------ Methods that are forwarded ----------------------#
132    def run(self):
133        """Run the search."""
134        return self.query.run()
135
136    def filter(self):
137        """Filter the results accordingly."""
138        return self.query.filter(self.filtering)
139
140    @property
141    def results(self):
142        """Parse the results in a meaningful manner."""
143        return self.query.results
144
145    #-------------------------- BLAST IMPLEMENTATION -------------------------#
146    @property_cached
147    def blast_params(self):
148        """
149        A dictionary of options to pass to the blast executable.
150        These params should depend on the filtering options.
151        """
152        # Make a copy #
153        params = self.params.copy()
154        # Based on the e-value #
155        if 'e_value' in self.filtering:
156            params['-evalue'] = self.filtering['e_value']
157        # Based on the maximum number of hits #
158        if 'max_targets' in self.filtering:
159            params['-max_target_seqs'] = self.filtering['max_targets']
160        # Return #
161        return params
162
163    def select_blast_algo(self):
164        """Depends on the query type and the database type."""
165        # Pick the right executable #
166        db_type = self.database.seq_type
167        if self.seq_type == 'nucl' and db_type == 'nucl': return 'blastn'
168        if self.seq_type == 'prot' and db_type == 'prot': return 'blastp'
169        if self.seq_type == 'nucl' and db_type == 'prot': return 'blastx'
170        if self.seq_type == 'prot' and db_type == 'nucl': return 'tblastn'
171
172    @property_cached
173    def blast_query(self):
174        """Make a BLAST search object."""
175        # Make the query object #
176        return BLASTquery(query_path = self.input_fasta,
177                          db_path    = self.database,
178                          seq_type   = self.seq_type,
179                          params     = self.blast_params,
180                          algorithm  = self.select_blast_algo(),
181                          cpus       = self.num_threads,
182                          out_path   = self.out_path,
183                          _out       = self._out,
184                          _err       = self._err)
185
186    #------------------------- VSEARCH IMPLEMENTATION ------------------------#
187    @property_cached
188    def vsearch_params(self):
189        """
190        A dictionary of options to pass to the vsearch executable.
191        These params should depend on the filtering options.
192        """
193        # Make a copy #
194        params = self.params.copy()
195        # Based on the maximum number of hits #
196        if 'max_targets' in self.filtering:
197            params['-maxaccepts'] = self.filtering['max_targets']
198        # Return #
199        return params
200
201    @property_cached
202    def vsearch_query(self):
203        """Make a VSEARCH search object."""
204        return VSEARCHquery(query_path = self.input_fasta,
205                            db_path    = self.database,
206                            seq_type   = self.seq_type,
207                            params     = self.vsearch_params,
208                            algorithm  = "usearch_global",
209                            cpus       = self.num_threads,
210                            out_path   = self.out_path,
211                            _out       = self._out,
212                            _err       = self._err)
213
214    #-------------------------- HMMER IMPLEMENTATION -------------------------#
215    @property_cached
216    def hmmer_params(self):
217        params = self.params.copy()
218        if 'e_value' in self.filtering: params['-E'] = self.filtering['e_value']
219        return params
220
221    @property_cached
222    def hmmer_query(self):
223        """Make a HMMER search object."""
224        return HmmQuery(query_path = self.input_fasta,
225                        db_path    = self.database,
226                        seq_type   = self.seq_type,
227                        params     = self.hmmer_params,
228                        cpus       = self.num_threads,
229                        out_path   = self.out_path)
class SeqSearch:
 22class SeqSearch(object):
 23    """
 24    A sequence similarity search.
 25    Is able to use different algorithms such as BLAST, VSEARCH, HMMER, BLAT etc.
 26    all through the same interface.
 27
 28    Input: - Series of sequences in a FASTA file.
 29           - A database to search against.
 30           - The type of the sequences ('prot' or 'nucl').
 31           - The type of algorithm to use. Currently BLAST, VSEARCH or HMMER.
 32           - Number of threads to use.
 33           - The desired output path.
 34           - An extra set of parameters to be given to the search command.
 35           - The filtering options:
 36              * BLAST supported:   - e-value
 37                                   - Maximum targets
 38                                   - Minimum identity (via manual output format)
 39                                   - Minimum query coverage (via manual output format)
 40              * VSEARCH supported: - Maximum targets
 41              * HMMER supported:   - e-value
 42
 43    Output: - List of identifiers in the database
 44              (object with significance value and identity attached)
 45
 46    Other possible algorithms:
 47
 48        * https://www.ncbi.nlm.nih.gov/pubmed/11932250
 49        * https://www.animalgenome.org/bioinfo/resources/manuals/wu-blast/
 50        * https://github.com/bbuchfink/diamond
 51    """
 52
 53    def __repr__(self):
 54        return '<%s object on %s>' % (self.__class__.__name__, self.input_fasta)
 55
 56    def __bool__(self):
 57        return bool(self.query)
 58
 59    def __init__(self,
 60                 input_fasta,
 61                 database,
 62                 seq_type    = 'prot' or 'nucl',      # What sequence type is the input fasta
 63                 algorithm   = 'blast' or 'vsearch',  # Which implementation do you want
 64                 num_threads = None,                  # How many processes to use
 65                 filtering   = None,                  # The result filtering options
 66                 out_path    = None,                  # Where the out file will be
 67                 params      = None,                  # Add extra params for the command line
 68                 _out        = None,                  # Store the stdout at this path
 69                 _err        = None):                 # Store the stderr at this path
 70        # Save attributes #
 71        self.input_fasta = input_fasta
 72        self.database    = database
 73        self.seq_type    = seq_type
 74        self.algorithm   = algorithm
 75        self.num_threads = num_threads
 76        self.filtering   = filtering
 77        self.out_path    = out_path
 78        self.params      = params
 79        self._out        = _out
 80        self._err        = _err
 81        # Assign default values #
 82        self.set_defaults()
 83        # Validate attributes #
 84        self.validate()
 85
 86    def set_defaults(self):
 87        """
 88        This method will replace empty attributes with defaults when this is
 89        needed.
 90        """
 91        # In case we got a special object, just use the blast_db attribute #
 92        if self.algorithm == 'blast' and hasattr(self.database, 'blast_db'):
 93            self.database = self.database.blast_db
 94        if self.algorithm == 'vsearch' and hasattr(self.database, 'vsearch_db'):
 95            self.database = self.database.vsearch_db
 96        # Otherwise in case we got a path, convert it to a BLASTdb #
 97        if self.algorithm == 'blast' and not isinstance(self.database, BLASTdb):
 98            self.database = BLASTdb(self.database)
 99        # The filtering options #
100        if self.filtering is None: self.filtering = {}
101        # Output path default value #
102        if self.out_path is None:
103            self.out_path = self.input_fasta.prefix_path + '.' + \
104                            self.algorithm + 'out'
105        # Output path setting #
106        self.out_path = FilePath(self.out_path)
107        # Number of cores default value #
108        if self.num_threads is None or self.num_threads is True:
109            self.num_threads = min(multiprocessing.cpu_count(), 32)
110        # Extra params to be given to the search algorithm #
111        if self.params is None: self.params = {}
112
113    def validate(self):
114        """
115        This method will raise an Exception is any of the arguments passed by
116        the user are illegal.
117        """
118        # The FASTA file has to contain something #
119        assert self.input_fasta
120
121    @property
122    def query(self):
123        """The actual search object with all the relevant parameters."""
124        # Pick the right attribute #
125        if self.algorithm == 'blast':   return self.blast_query
126        if self.algorithm == 'vsearch': return self.vsearch_query
127        if self.algorithm == 'hmmer':   return self.hmmer_query
128        # Otherwise raise an exception #
129        msg = "The algorithm '%s' is not supported."
130        raise NotImplemented(msg % self.algorithm)
131
132    #------------------------ Methods that are forwarded ----------------------#
133    def run(self):
134        """Run the search."""
135        return self.query.run()
136
137    def filter(self):
138        """Filter the results accordingly."""
139        return self.query.filter(self.filtering)
140
141    @property
142    def results(self):
143        """Parse the results in a meaningful manner."""
144        return self.query.results
145
146    #-------------------------- BLAST IMPLEMENTATION -------------------------#
147    @property_cached
148    def blast_params(self):
149        """
150        A dictionary of options to pass to the blast executable.
151        These params should depend on the filtering options.
152        """
153        # Make a copy #
154        params = self.params.copy()
155        # Based on the e-value #
156        if 'e_value' in self.filtering:
157            params['-evalue'] = self.filtering['e_value']
158        # Based on the maximum number of hits #
159        if 'max_targets' in self.filtering:
160            params['-max_target_seqs'] = self.filtering['max_targets']
161        # Return #
162        return params
163
164    def select_blast_algo(self):
165        """Depends on the query type and the database type."""
166        # Pick the right executable #
167        db_type = self.database.seq_type
168        if self.seq_type == 'nucl' and db_type == 'nucl': return 'blastn'
169        if self.seq_type == 'prot' and db_type == 'prot': return 'blastp'
170        if self.seq_type == 'nucl' and db_type == 'prot': return 'blastx'
171        if self.seq_type == 'prot' and db_type == 'nucl': return 'tblastn'
172
173    @property_cached
174    def blast_query(self):
175        """Make a BLAST search object."""
176        # Make the query object #
177        return BLASTquery(query_path = self.input_fasta,
178                          db_path    = self.database,
179                          seq_type   = self.seq_type,
180                          params     = self.blast_params,
181                          algorithm  = self.select_blast_algo(),
182                          cpus       = self.num_threads,
183                          out_path   = self.out_path,
184                          _out       = self._out,
185                          _err       = self._err)
186
187    #------------------------- VSEARCH IMPLEMENTATION ------------------------#
188    @property_cached
189    def vsearch_params(self):
190        """
191        A dictionary of options to pass to the vsearch executable.
192        These params should depend on the filtering options.
193        """
194        # Make a copy #
195        params = self.params.copy()
196        # Based on the maximum number of hits #
197        if 'max_targets' in self.filtering:
198            params['-maxaccepts'] = self.filtering['max_targets']
199        # Return #
200        return params
201
202    @property_cached
203    def vsearch_query(self):
204        """Make a VSEARCH search object."""
205        return VSEARCHquery(query_path = self.input_fasta,
206                            db_path    = self.database,
207                            seq_type   = self.seq_type,
208                            params     = self.vsearch_params,
209                            algorithm  = "usearch_global",
210                            cpus       = self.num_threads,
211                            out_path   = self.out_path,
212                            _out       = self._out,
213                            _err       = self._err)
214
215    #-------------------------- HMMER IMPLEMENTATION -------------------------#
216    @property_cached
217    def hmmer_params(self):
218        params = self.params.copy()
219        if 'e_value' in self.filtering: params['-E'] = self.filtering['e_value']
220        return params
221
222    @property_cached
223    def hmmer_query(self):
224        """Make a HMMER search object."""
225        return HmmQuery(query_path = self.input_fasta,
226                        db_path    = self.database,
227                        seq_type   = self.seq_type,
228                        params     = self.hmmer_params,
229                        cpus       = self.num_threads,
230                        out_path   = self.out_path)

A sequence similarity search. Is able to use different algorithms such as BLAST, VSEARCH, HMMER, BLAT etc. all through the same interface.

Input: - Series of sequences in a FASTA file. - A database to search against. - The type of the sequences ('prot' or 'nucl'). - The type of algorithm to use. Currently BLAST, VSEARCH or HMMER. - Number of threads to use. - The desired output path. - An extra set of parameters to be given to the search command. - The filtering options: * BLAST supported: - e-value - Maximum targets - Minimum identity (via manual output format) - Minimum query coverage (via manual output format) * VSEARCH supported: - Maximum targets * HMMER supported: - e-value

Output: - List of identifiers in the database (object with significance value and identity attached)

Other possible algorithms:

* https://www.ncbi.nlm.nih.gov/pubmed/11932250
* https://www.animalgenome.org/bioinfo/resources/manuals/wu-blast/
* https://github.com/bbuchfink/diamond
SeqSearch( input_fasta, database, seq_type='prot', algorithm='blast', num_threads=None, filtering=None, out_path=None, params=None, _out=None, _err=None)
59    def __init__(self,
60                 input_fasta,
61                 database,
62                 seq_type    = 'prot' or 'nucl',      # What sequence type is the input fasta
63                 algorithm   = 'blast' or 'vsearch',  # Which implementation do you want
64                 num_threads = None,                  # How many processes to use
65                 filtering   = None,                  # The result filtering options
66                 out_path    = None,                  # Where the out file will be
67                 params      = None,                  # Add extra params for the command line
68                 _out        = None,                  # Store the stdout at this path
69                 _err        = None):                 # Store the stderr at this path
70        # Save attributes #
71        self.input_fasta = input_fasta
72        self.database    = database
73        self.seq_type    = seq_type
74        self.algorithm   = algorithm
75        self.num_threads = num_threads
76        self.filtering   = filtering
77        self.out_path    = out_path
78        self.params      = params
79        self._out        = _out
80        self._err        = _err
81        # Assign default values #
82        self.set_defaults()
83        # Validate attributes #
84        self.validate()
def set_defaults(self):
 86    def set_defaults(self):
 87        """
 88        This method will replace empty attributes with defaults when this is
 89        needed.
 90        """
 91        # In case we got a special object, just use the blast_db attribute #
 92        if self.algorithm == 'blast' and hasattr(self.database, 'blast_db'):
 93            self.database = self.database.blast_db
 94        if self.algorithm == 'vsearch' and hasattr(self.database, 'vsearch_db'):
 95            self.database = self.database.vsearch_db
 96        # Otherwise in case we got a path, convert it to a BLASTdb #
 97        if self.algorithm == 'blast' and not isinstance(self.database, BLASTdb):
 98            self.database = BLASTdb(self.database)
 99        # The filtering options #
100        if self.filtering is None: self.filtering = {}
101        # Output path default value #
102        if self.out_path is None:
103            self.out_path = self.input_fasta.prefix_path + '.' + \
104                            self.algorithm + 'out'
105        # Output path setting #
106        self.out_path = FilePath(self.out_path)
107        # Number of cores default value #
108        if self.num_threads is None or self.num_threads is True:
109            self.num_threads = min(multiprocessing.cpu_count(), 32)
110        # Extra params to be given to the search algorithm #
111        if self.params is None: self.params = {}

This method will replace empty attributes with defaults when this is needed.

def validate(self):
113    def validate(self):
114        """
115        This method will raise an Exception is any of the arguments passed by
116        the user are illegal.
117        """
118        # The FASTA file has to contain something #
119        assert self.input_fasta

This method will raise an Exception is any of the arguments passed by the user are illegal.

query

The actual search object with all the relevant parameters.

def run(self):
133    def run(self):
134        """Run the search."""
135        return self.query.run()

Run the search.

def filter(self):
137    def filter(self):
138        """Filter the results accordingly."""
139        return self.query.filter(self.filtering)

Filter the results accordingly.

results

Parse the results in a meaningful manner.

blast_params

A dictionary of options to pass to the blast executable. These params should depend on the filtering options.

def select_blast_algo(self):
164    def select_blast_algo(self):
165        """Depends on the query type and the database type."""
166        # Pick the right executable #
167        db_type = self.database.seq_type
168        if self.seq_type == 'nucl' and db_type == 'nucl': return 'blastn'
169        if self.seq_type == 'prot' and db_type == 'prot': return 'blastp'
170        if self.seq_type == 'nucl' and db_type == 'prot': return 'blastx'
171        if self.seq_type == 'prot' and db_type == 'nucl': return 'tblastn'

Depends on the query type and the database type.

blast_query

Make a BLAST search object.

vsearch_params

A dictionary of options to pass to the vsearch executable. These params should depend on the filtering options.

vsearch_query

Make a VSEARCH search object.

hmmer_query

Make a HMMER search object.