seqsearch.search
Written by Lucas Sinclair. MIT Licensed. Contact at www.sinclair.bio
1#!/usr/bin/env python3 2# -*- coding: utf-8 -*- 3 4""" 5Written by Lucas Sinclair. 6MIT Licensed. 7Contact at www.sinclair.bio 8""" 9 10# Built-in modules # 11import multiprocessing 12 13# First party modules # 14from seqsearch.search.blast import BLASTquery, BLASTdb 15from seqsearch.search.vsearch import VSEARCHquery 16from seqsearch.search.hmmer import HmmQuery 17from plumbing.cache import property_cached 18from autopaths.file_path import FilePath 19 20################################################################################ 21class SeqSearch(object): 22 """ 23 A sequence similarity search. 24 Is able to use different algorithms such as BLAST, VSEARCH, HMMER, BLAT etc. 25 all through the same interface. 26 27 Input: - Series of sequences in a FASTA file. 28 - A database to search against. 29 - The type of the sequences ('prot' or 'nucl'). 30 - The type of algorithm to use. Currently BLAST, VSEARCH or HMMER. 31 - Number of threads to use. 32 - The desired output path. 33 - An extra set of parameters to be given to the search command. 34 - The filtering options: 35 * BLAST supported: - e-value 36 - Maximum targets 37 - Minimum identity (via manual output format) 38 - Minimum query coverage (via manual output format) 39 * VSEARCH supported: - Maximum targets 40 * HMMER supported: - e-value 41 42 Output: - List of identifiers in the database 43 (object with significance value and identity attached) 44 45 Other possible algorithms: 46 47 * https://www.ncbi.nlm.nih.gov/pubmed/11932250 48 * https://www.animalgenome.org/bioinfo/resources/manuals/wu-blast/ 49 * https://github.com/bbuchfink/diamond 50 """ 51 52 def __repr__(self): 53 return '<%s object on %s>' % (self.__class__.__name__, self.input_fasta) 54 55 def __bool__(self): 56 return bool(self.query) 57 58 def __init__(self, 59 input_fasta, 60 database, 61 seq_type = 'prot' or 'nucl', # What sequence type is the input fasta 62 algorithm = 'blast' or 'vsearch', # Which implementation do you want 63 num_threads = None, # How many processes to use 64 filtering = None, # The result filtering options 65 out_path = None, # Where the out file will be 66 params = None, # Add extra params for the command line 67 _out = None, # Store the stdout at this path 68 _err = None): # Store the stderr at this path 69 # Save attributes # 70 self.input_fasta = input_fasta 71 self.database = database 72 self.seq_type = seq_type 73 self.algorithm = algorithm 74 self.num_threads = num_threads 75 self.filtering = filtering 76 self.out_path = out_path 77 self.params = params 78 self._out = _out 79 self._err = _err 80 # Assign default values # 81 self.set_defaults() 82 # Validate attributes # 83 self.validate() 84 85 def set_defaults(self): 86 """ 87 This method will replace empty attributes with defaults when this is 88 needed. 89 """ 90 # In case we got a special object, just use the blast_db attribute # 91 if self.algorithm == 'blast' and hasattr(self.database, 'blast_db'): 92 self.database = self.database.blast_db 93 if self.algorithm == 'vsearch' and hasattr(self.database, 'vsearch_db'): 94 self.database = self.database.vsearch_db 95 # Otherwise in case we got a path, convert it to a BLASTdb # 96 if self.algorithm == 'blast' and not isinstance(self.database, BLASTdb): 97 self.database = BLASTdb(self.database) 98 # The filtering options # 99 if self.filtering is None: self.filtering = {} 100 # Output path default value # 101 if self.out_path is None: 102 self.out_path = self.input_fasta.prefix_path + '.' + \ 103 self.algorithm + 'out' 104 # Output path setting # 105 self.out_path = FilePath(self.out_path) 106 # Number of cores default value # 107 if self.num_threads is None or self.num_threads is True: 108 self.num_threads = min(multiprocessing.cpu_count(), 32) 109 # Extra params to be given to the search algorithm # 110 if self.params is None: self.params = {} 111 112 def validate(self): 113 """ 114 This method will raise an Exception is any of the arguments passed by 115 the user are illegal. 116 """ 117 # The FASTA file has to contain something # 118 assert self.input_fasta 119 120 @property 121 def query(self): 122 """The actual search object with all the relevant parameters.""" 123 # Pick the right attribute # 124 if self.algorithm == 'blast': return self.blast_query 125 if self.algorithm == 'vsearch': return self.vsearch_query 126 if self.algorithm == 'hmmer': return self.hmmer_query 127 # Otherwise raise an exception # 128 msg = "The algorithm '%s' is not supported." 129 raise NotImplemented(msg % self.algorithm) 130 131 #------------------------ Methods that are forwarded ----------------------# 132 def run(self): 133 """Run the search.""" 134 return self.query.run() 135 136 def filter(self): 137 """Filter the results accordingly.""" 138 return self.query.filter(self.filtering) 139 140 @property 141 def results(self): 142 """Parse the results in a meaningful manner.""" 143 return self.query.results 144 145 #-------------------------- BLAST IMPLEMENTATION -------------------------# 146 @property_cached 147 def blast_params(self): 148 """ 149 A dictionary of options to pass to the blast executable. 150 These params should depend on the filtering options. 151 """ 152 # Make a copy # 153 params = self.params.copy() 154 # Based on the e-value # 155 if 'e_value' in self.filtering: 156 params['-evalue'] = self.filtering['e_value'] 157 # Based on the maximum number of hits # 158 if 'max_targets' in self.filtering: 159 params['-max_target_seqs'] = self.filtering['max_targets'] 160 # Return # 161 return params 162 163 def select_blast_algo(self): 164 """Depends on the query type and the database type.""" 165 # Pick the right executable # 166 db_type = self.database.seq_type 167 if self.seq_type == 'nucl' and db_type == 'nucl': return 'blastn' 168 if self.seq_type == 'prot' and db_type == 'prot': return 'blastp' 169 if self.seq_type == 'nucl' and db_type == 'prot': return 'blastx' 170 if self.seq_type == 'prot' and db_type == 'nucl': return 'tblastn' 171 172 @property_cached 173 def blast_query(self): 174 """Make a BLAST search object.""" 175 # Make the query object # 176 return BLASTquery(query_path = self.input_fasta, 177 db_path = self.database, 178 seq_type = self.seq_type, 179 params = self.blast_params, 180 algorithm = self.select_blast_algo(), 181 cpus = self.num_threads, 182 out_path = self.out_path, 183 _out = self._out, 184 _err = self._err) 185 186 #------------------------- VSEARCH IMPLEMENTATION ------------------------# 187 @property_cached 188 def vsearch_params(self): 189 """ 190 A dictionary of options to pass to the vsearch executable. 191 These params should depend on the filtering options. 192 """ 193 # Make a copy # 194 params = self.params.copy() 195 # Based on the maximum number of hits # 196 if 'max_targets' in self.filtering: 197 params['-maxaccepts'] = self.filtering['max_targets'] 198 # Return # 199 return params 200 201 @property_cached 202 def vsearch_query(self): 203 """Make a VSEARCH search object.""" 204 return VSEARCHquery(query_path = self.input_fasta, 205 db_path = self.database, 206 seq_type = self.seq_type, 207 params = self.vsearch_params, 208 algorithm = "usearch_global", 209 cpus = self.num_threads, 210 out_path = self.out_path, 211 _out = self._out, 212 _err = self._err) 213 214 #-------------------------- HMMER IMPLEMENTATION -------------------------# 215 @property_cached 216 def hmmer_params(self): 217 params = self.params.copy() 218 if 'e_value' in self.filtering: params['-E'] = self.filtering['e_value'] 219 return params 220 221 @property_cached 222 def hmmer_query(self): 223 """Make a HMMER search object.""" 224 return HmmQuery(query_path = self.input_fasta, 225 db_path = self.database, 226 seq_type = self.seq_type, 227 params = self.hmmer_params, 228 cpus = self.num_threads, 229 out_path = self.out_path)
22class SeqSearch(object): 23 """ 24 A sequence similarity search. 25 Is able to use different algorithms such as BLAST, VSEARCH, HMMER, BLAT etc. 26 all through the same interface. 27 28 Input: - Series of sequences in a FASTA file. 29 - A database to search against. 30 - The type of the sequences ('prot' or 'nucl'). 31 - The type of algorithm to use. Currently BLAST, VSEARCH or HMMER. 32 - Number of threads to use. 33 - The desired output path. 34 - An extra set of parameters to be given to the search command. 35 - The filtering options: 36 * BLAST supported: - e-value 37 - Maximum targets 38 - Minimum identity (via manual output format) 39 - Minimum query coverage (via manual output format) 40 * VSEARCH supported: - Maximum targets 41 * HMMER supported: - e-value 42 43 Output: - List of identifiers in the database 44 (object with significance value and identity attached) 45 46 Other possible algorithms: 47 48 * https://www.ncbi.nlm.nih.gov/pubmed/11932250 49 * https://www.animalgenome.org/bioinfo/resources/manuals/wu-blast/ 50 * https://github.com/bbuchfink/diamond 51 """ 52 53 def __repr__(self): 54 return '<%s object on %s>' % (self.__class__.__name__, self.input_fasta) 55 56 def __bool__(self): 57 return bool(self.query) 58 59 def __init__(self, 60 input_fasta, 61 database, 62 seq_type = 'prot' or 'nucl', # What sequence type is the input fasta 63 algorithm = 'blast' or 'vsearch', # Which implementation do you want 64 num_threads = None, # How many processes to use 65 filtering = None, # The result filtering options 66 out_path = None, # Where the out file will be 67 params = None, # Add extra params for the command line 68 _out = None, # Store the stdout at this path 69 _err = None): # Store the stderr at this path 70 # Save attributes # 71 self.input_fasta = input_fasta 72 self.database = database 73 self.seq_type = seq_type 74 self.algorithm = algorithm 75 self.num_threads = num_threads 76 self.filtering = filtering 77 self.out_path = out_path 78 self.params = params 79 self._out = _out 80 self._err = _err 81 # Assign default values # 82 self.set_defaults() 83 # Validate attributes # 84 self.validate() 85 86 def set_defaults(self): 87 """ 88 This method will replace empty attributes with defaults when this is 89 needed. 90 """ 91 # In case we got a special object, just use the blast_db attribute # 92 if self.algorithm == 'blast' and hasattr(self.database, 'blast_db'): 93 self.database = self.database.blast_db 94 if self.algorithm == 'vsearch' and hasattr(self.database, 'vsearch_db'): 95 self.database = self.database.vsearch_db 96 # Otherwise in case we got a path, convert it to a BLASTdb # 97 if self.algorithm == 'blast' and not isinstance(self.database, BLASTdb): 98 self.database = BLASTdb(self.database) 99 # The filtering options # 100 if self.filtering is None: self.filtering = {} 101 # Output path default value # 102 if self.out_path is None: 103 self.out_path = self.input_fasta.prefix_path + '.' + \ 104 self.algorithm + 'out' 105 # Output path setting # 106 self.out_path = FilePath(self.out_path) 107 # Number of cores default value # 108 if self.num_threads is None or self.num_threads is True: 109 self.num_threads = min(multiprocessing.cpu_count(), 32) 110 # Extra params to be given to the search algorithm # 111 if self.params is None: self.params = {} 112 113 def validate(self): 114 """ 115 This method will raise an Exception is any of the arguments passed by 116 the user are illegal. 117 """ 118 # The FASTA file has to contain something # 119 assert self.input_fasta 120 121 @property 122 def query(self): 123 """The actual search object with all the relevant parameters.""" 124 # Pick the right attribute # 125 if self.algorithm == 'blast': return self.blast_query 126 if self.algorithm == 'vsearch': return self.vsearch_query 127 if self.algorithm == 'hmmer': return self.hmmer_query 128 # Otherwise raise an exception # 129 msg = "The algorithm '%s' is not supported." 130 raise NotImplemented(msg % self.algorithm) 131 132 #------------------------ Methods that are forwarded ----------------------# 133 def run(self): 134 """Run the search.""" 135 return self.query.run() 136 137 def filter(self): 138 """Filter the results accordingly.""" 139 return self.query.filter(self.filtering) 140 141 @property 142 def results(self): 143 """Parse the results in a meaningful manner.""" 144 return self.query.results 145 146 #-------------------------- BLAST IMPLEMENTATION -------------------------# 147 @property_cached 148 def blast_params(self): 149 """ 150 A dictionary of options to pass to the blast executable. 151 These params should depend on the filtering options. 152 """ 153 # Make a copy # 154 params = self.params.copy() 155 # Based on the e-value # 156 if 'e_value' in self.filtering: 157 params['-evalue'] = self.filtering['e_value'] 158 # Based on the maximum number of hits # 159 if 'max_targets' in self.filtering: 160 params['-max_target_seqs'] = self.filtering['max_targets'] 161 # Return # 162 return params 163 164 def select_blast_algo(self): 165 """Depends on the query type and the database type.""" 166 # Pick the right executable # 167 db_type = self.database.seq_type 168 if self.seq_type == 'nucl' and db_type == 'nucl': return 'blastn' 169 if self.seq_type == 'prot' and db_type == 'prot': return 'blastp' 170 if self.seq_type == 'nucl' and db_type == 'prot': return 'blastx' 171 if self.seq_type == 'prot' and db_type == 'nucl': return 'tblastn' 172 173 @property_cached 174 def blast_query(self): 175 """Make a BLAST search object.""" 176 # Make the query object # 177 return BLASTquery(query_path = self.input_fasta, 178 db_path = self.database, 179 seq_type = self.seq_type, 180 params = self.blast_params, 181 algorithm = self.select_blast_algo(), 182 cpus = self.num_threads, 183 out_path = self.out_path, 184 _out = self._out, 185 _err = self._err) 186 187 #------------------------- VSEARCH IMPLEMENTATION ------------------------# 188 @property_cached 189 def vsearch_params(self): 190 """ 191 A dictionary of options to pass to the vsearch executable. 192 These params should depend on the filtering options. 193 """ 194 # Make a copy # 195 params = self.params.copy() 196 # Based on the maximum number of hits # 197 if 'max_targets' in self.filtering: 198 params['-maxaccepts'] = self.filtering['max_targets'] 199 # Return # 200 return params 201 202 @property_cached 203 def vsearch_query(self): 204 """Make a VSEARCH search object.""" 205 return VSEARCHquery(query_path = self.input_fasta, 206 db_path = self.database, 207 seq_type = self.seq_type, 208 params = self.vsearch_params, 209 algorithm = "usearch_global", 210 cpus = self.num_threads, 211 out_path = self.out_path, 212 _out = self._out, 213 _err = self._err) 214 215 #-------------------------- HMMER IMPLEMENTATION -------------------------# 216 @property_cached 217 def hmmer_params(self): 218 params = self.params.copy() 219 if 'e_value' in self.filtering: params['-E'] = self.filtering['e_value'] 220 return params 221 222 @property_cached 223 def hmmer_query(self): 224 """Make a HMMER search object.""" 225 return HmmQuery(query_path = self.input_fasta, 226 db_path = self.database, 227 seq_type = self.seq_type, 228 params = self.hmmer_params, 229 cpus = self.num_threads, 230 out_path = self.out_path)
A sequence similarity search. Is able to use different algorithms such as BLAST, VSEARCH, HMMER, BLAT etc. all through the same interface.
Input: - Series of sequences in a FASTA file. - A database to search against. - The type of the sequences ('prot' or 'nucl'). - The type of algorithm to use. Currently BLAST, VSEARCH or HMMER. - Number of threads to use. - The desired output path. - An extra set of parameters to be given to the search command. - The filtering options: * BLAST supported: - e-value - Maximum targets - Minimum identity (via manual output format) - Minimum query coverage (via manual output format) * VSEARCH supported: - Maximum targets * HMMER supported: - e-value
Output: - List of identifiers in the database (object with significance value and identity attached)
Other possible algorithms:
* https://www.ncbi.nlm.nih.gov/pubmed/11932250
* https://www.animalgenome.org/bioinfo/resources/manuals/wu-blast/
* https://github.com/bbuchfink/diamond
59 def __init__(self, 60 input_fasta, 61 database, 62 seq_type = 'prot' or 'nucl', # What sequence type is the input fasta 63 algorithm = 'blast' or 'vsearch', # Which implementation do you want 64 num_threads = None, # How many processes to use 65 filtering = None, # The result filtering options 66 out_path = None, # Where the out file will be 67 params = None, # Add extra params for the command line 68 _out = None, # Store the stdout at this path 69 _err = None): # Store the stderr at this path 70 # Save attributes # 71 self.input_fasta = input_fasta 72 self.database = database 73 self.seq_type = seq_type 74 self.algorithm = algorithm 75 self.num_threads = num_threads 76 self.filtering = filtering 77 self.out_path = out_path 78 self.params = params 79 self._out = _out 80 self._err = _err 81 # Assign default values # 82 self.set_defaults() 83 # Validate attributes # 84 self.validate()
86 def set_defaults(self): 87 """ 88 This method will replace empty attributes with defaults when this is 89 needed. 90 """ 91 # In case we got a special object, just use the blast_db attribute # 92 if self.algorithm == 'blast' and hasattr(self.database, 'blast_db'): 93 self.database = self.database.blast_db 94 if self.algorithm == 'vsearch' and hasattr(self.database, 'vsearch_db'): 95 self.database = self.database.vsearch_db 96 # Otherwise in case we got a path, convert it to a BLASTdb # 97 if self.algorithm == 'blast' and not isinstance(self.database, BLASTdb): 98 self.database = BLASTdb(self.database) 99 # The filtering options # 100 if self.filtering is None: self.filtering = {} 101 # Output path default value # 102 if self.out_path is None: 103 self.out_path = self.input_fasta.prefix_path + '.' + \ 104 self.algorithm + 'out' 105 # Output path setting # 106 self.out_path = FilePath(self.out_path) 107 # Number of cores default value # 108 if self.num_threads is None or self.num_threads is True: 109 self.num_threads = min(multiprocessing.cpu_count(), 32) 110 # Extra params to be given to the search algorithm # 111 if self.params is None: self.params = {}
This method will replace empty attributes with defaults when this is needed.
113 def validate(self): 114 """ 115 This method will raise an Exception is any of the arguments passed by 116 the user are illegal. 117 """ 118 # The FASTA file has to contain something # 119 assert self.input_fasta
This method will raise an Exception is any of the arguments passed by the user are illegal.
137 def filter(self): 138 """Filter the results accordingly.""" 139 return self.query.filter(self.filtering)
Filter the results accordingly.
A dictionary of options to pass to the blast executable. These params should depend on the filtering options.
164 def select_blast_algo(self): 165 """Depends on the query type and the database type.""" 166 # Pick the right executable # 167 db_type = self.database.seq_type 168 if self.seq_type == 'nucl' and db_type == 'nucl': return 'blastn' 169 if self.seq_type == 'prot' and db_type == 'prot': return 'blastp' 170 if self.seq_type == 'nucl' and db_type == 'prot': return 'blastx' 171 if self.seq_type == 'prot' and db_type == 'nucl': return 'tblastn'
Depends on the query type and the database type.