crest4.classify

Written by Lucas Sinclair. GNUv3 Licensed. Contact at www.sinclair.bio

  1#!/usr/bin/env python3
  2# -*- coding: utf-8 -*-
  3
  4"""
  5Written by Lucas Sinclair.
  6GNUv3 Licensed.
  7Contact at www.sinclair.bio
  8"""
  9
 10# Built-in modules #
 11import os, multiprocessing
 12
 13# Internal modules #
 14import crest4
 15import crest4.databases
 16from crest4.query import Query
 17from crest4.databases import CrestDatabase
 18
 19# First party modules #
 20from functools import cached_property
 21from autopaths.file_path import FilePath
 22from autopaths.dir_path  import DirectoryPath
 23
 24# Constants #
 25all_db_choices = ('midori253darn', 'silvamod138pr2', 'mitofish', 'ssuome')
 26
 27###############################################################################
 28class Classify:
 29    """
 30    This is the main object offered by the `crest4` package.
 31    It enables you to automatically assign taxonomic names to DNA sequences
 32    obtained from environmental sequencing.
 33
 34    After creating a new instance of a `Classify` object, you can simply call
 35    it to get your input data processed and the assignments output file
 36    generated. Examples are included in the `README.md` file of this package,
 37    which can directly be seen on the GitHub page at:
 38
 39    https://github.com/xapple/crest4/
 40    """
 41
 42    def __init__(self,
 43                 fasta,
 44                 search_algo = 'blast',
 45                 num_threads = 1,
 46                 search_db   = 'ssuome',
 47                 output_dir  = None,
 48                 search_hits = None,
 49                 min_score   = None,
 50                 score_drop  = 2.0,
 51                 min_smlrty  = True,
 52                 otu_table   = None,
 53                 ):
 54        """
 55        Args:
 56
 57            fasta: The path to a single FASTA file as a string.
 58                   These are the sequences that will be taxonomically
 59                   classified.
 60
 61            search_algo: The algorithm used for the sequence similarity search
 62                         that will be run to match the sequences against the
 63                         database chosen. Either `blast` or `vsearch`. No
 64                         other values are currently supported. By default,
 65                         `blast`.
 66
 67            num_threads: The number of processors to use for the sequence
 68                         similarity search. By default, parallelism is turned
 69                         off, and this value is 1. If you pass the value
 70                         `True,` we will run as many processes as there are
 71                         CPUs but no more than 32.
 72
 73            search_db: The database used for the sequence similarity search.
 74                       Either `ssuome`, `silvamod138pr2`, 'mitofish',
 75                       or `midori253darn`. By default, `ssuome`.
 76                       Optionally, the user can provide a custom database by
 77                       specifying the full path to a directory containing all
 78                       required files under `search_db`.
 79                       See the README for more information.
 80
 81            output_dir: The directory into which all the classification
 82                        results will be written to. This defaults to a
 83                        directory with the same name as the original FASTA
 84                        file and a `.crest4` suffix appended.
 85
 86            search_hits: The path where the search results will be stored.
 87                         This defaults to the output directory. However,
 88                         if the search operation has already been completed
 89                         beforehand, specify the path here to skip the
 90                         sequence similarity search step and go directly to
 91                         the taxonomy step. If a hits file exists in the output
 92                         directory and this option is not specified, it is
 93                         deleted and regenerated.
 94
 95            min_score: The minimum bit-score for a search hit to be considered
 96                       when using BLAST as the search algorithm. All hits below
 97                       this score are ignored. When using VSEARCH, this value
 98                       instead indicates the minimum identity between two
 99                       sequences for the hit to be considered.
100                       The default is `155` for BLAST and `0.75` for VSEARCH.
101
102            score_drop: Determines the range of hits to retain and the range
103                        to discard based on a drop in percentage from the score
104                        of the best hit. Any hit below the following value:
105                        "(100 - score_drop)/100 * best_hit_score" is ignored.
106                        By default, `2.0`.
107
108            min_smlrty: Determines if the minimum similarity filter is turned
109                        on or off. Pass any value like `False` to turn it off.
110                        The minimum similarity filter prevents classification
111                        to higher ranks when a minimum rank-identity is not
112                        met. The default is `True`.
113
114            otu_table: Optionally, one can specify the path to an existing OTU
115                       table in CSV or TSV format when running `crest4`.
116                       The sequence names in the OTU table must be rows and
117                       have to match the names in the FASTA file. The column,
118                       on the other hand, provide your samples names.
119                       When this option is used, then two extra output files
120                       are generated. Firstly, a table summarizing the
121                       assignment counts per taxa. Secondly, a table
122                       propagating the sequence counts upwards
123                       in a cumulative fashion.
124                       """
125        # Save attributes #
126        self.fasta       = fasta
127        self.search_algo = search_algo
128        self.num_threads = num_threads
129        self.search_db   = search_db
130        self.output_dir  = output_dir
131        self.search_hits = search_hits
132        self.min_score   = min_score
133        self.score_drop  = score_drop
134        self.min_smlrty  = min_smlrty
135        self.otu_table   = otu_table
136        # Assign default values and change others #
137        self.transform()
138        # Validate attributes #
139        self.validate()
140
141    def transform(self):
142        """
143        This method will replace empty attributes with defaults when this is
144        needed and will convert others to proper types.
145        """
146        # The fasta should be a FASTA object #
147        if self.fasta is not None:
148            from fasta import FASTA
149            self.fasta = FASTA(self.fasta)
150        # Default for the number of threads #
151        if not isinstance(self.num_threads, int):
152            if self.num_threads is True:
153                self.num_threads = min(multiprocessing.cpu_count(), 32)
154            elif self.num_threads is False:
155                self.num_threads = 1
156            elif self.num_threads.lower() == 'true':
157                self.num_threads = min(multiprocessing.cpu_count(), 32)
158        # The database is always a string #
159        self.search_db = str(self.search_db)
160        # Default for the output directory #
161        if self.output_dir is None:
162            self.output_dir = self.fasta + '.crest4/'
163        self.output_dir = DirectoryPath(self.output_dir)
164        # The search hits is a file somewhere if passed #
165        if self.search_hits is not None:
166            self.search_hits = FilePath(self.search_hits)
167        # Default for the search hits file if not passed #
168        if self.search_hits is None:
169            self.search_hits = FilePath(self.output_dir + 'search.hits')
170            self.search_hits.remove()
171        # Default for the minimum score #
172        if self.min_score is None:
173            if self.search_algo == 'blast':
174                self.min_score = 155
175            if self.search_algo == 'vsearch':
176                self.min_score = 0.75
177        # The minimum score has to be a number, not a string #
178        try:
179            self.min_score = float(self.min_score)
180        except (ValueError, TypeError):
181            msg = "The minimum score value must be numerical (not '%s')."
182            raise ValueError(msg % self.min_score)
183        # The score drop has to be a float, not a string #
184        try:
185            self.score_drop = float(self.score_drop)
186        except (ValueError, TypeError):
187            msg = "The score drop value must be numerical (not '%s')."
188            raise ValueError(msg % self.score_drop)
189        # Turn off the minimum similarity filter if the user passed any value #
190        if self.min_smlrty is not True: self.min_smlrty = False
191        # The OTU table is a file somewhere if passed #
192        if self.otu_table is not None:
193            self.otu_table = FilePath(self.otu_table)
194            self.otu_table.must_exist()
195
196    def validate(self):
197        """
198        This method will raise an Exception if any of the arguments passed by
199        the user are illegal.
200        """
201        # The fasta should exist if passed #
202        if self.fasta is not None:
203            self.fasta.must_exist()
204        # Either the FASTA file or the hits file has to contain something #
205        if not self.fasta and not self.search_hits:
206            msg = "Neither the FASTA file at '%s' nor the search hits file" \
207                  " at '%s' contain any data. Cannot proceed."
208            raise Exception(msg % (self.fasta, self.search_hits))
209        # Check the search algorithm #
210        if self.search_algo not in ('blast', 'vsearch'):
211            msg = "The search algorithm '%s' is not supported."
212            raise ValueError(msg % self.search_algo)
213        # The search database is a known entry or exists on the filesystem #
214        if self.search_db not in all_db_choices:
215            if not os.path.exists(self.search_db):
216                msg = "The search database '%s' is not supported."
217                raise ValueError(msg % self.search_db)
218        # Check the minimum score value is above zero #
219        if self.min_score < 0.0:
220            msg = "The minimum score cannot be smaller than zero ('%s')."
221            raise ValueError(msg % self.min_score)
222        # Check the minimum score value is below one #
223        if self.min_score > 1.0:
224            if self.search_algo == 'vsearch':
225                msg = "The minimum score cannot be more than 1.0 when" \
226                      " using VSEARCH ('%s') because it represents the" \
227                      " the minimum identity between two sequences."
228                raise ValueError(msg % self.min_score)
229        # Check the score drop value #
230        if self.score_drop < 0.0:
231            msg = "The score drop value cannot be smaller than zero ('%s')."
232            raise ValueError(msg % self.min_score)
233        if self.score_drop > 100.0:
234            msg = "The score drop value cannot be over 100 ('%s')."
235            raise ValueError(msg % self.min_score)
236
237    def __repr__(self):
238        """A simple representation of this object to avoid memory addresses."""
239        return "<%s object on '%s'>" % (self.__class__.__name__, self.fasta)
240
241    @cached_property
242    def database(self):
243        """
244        Retrieve the database object that the user has selected.
245        This can be either a standard database or a custom-specified path.
246        """
247        if self.search_db not in all_db_choices:
248            # Take the absolute path #
249            user_path = os.path.abspath(self.search_db)
250            # Make the object #
251            return CrestDatabase(custom_path=user_path)
252        else:
253            return getattr(crest4.databases, self.search_db)
254
255    #------------------------------ Searching --------------------------------#
256    @cached_property
257    def seqsearch(self):
258        """
259        An object representing the sequence similarity search.
260        Makes use of the `seqsearch` module. For reference:
261
262        * Setting `-outfmt` to 5 means XML output.
263        * Setting `-outfmt` to 6 means tabular output.
264        * Setting `-outfmt` to 7 means tabular output with comments.
265        """
266        # Initialize #
267        params = {}
268        # If the user chose BLAST then we have to specify tabular output #
269        if self.search_algo == 'blast':
270            params = {'-outfmt': '7 qseqid sseqid bitscore length nident'}
271        # In case the user chose VSEARCH we specify the minimum identity
272        # and the minimum sequence match length
273        if self.search_algo == 'vsearch':
274            params = {'--id':      self.min_score,
275                      '--mincols': 25}
276        # Build and return the object #
277        from seqsearch.search import SeqSearch
278        return SeqSearch(input_fasta = self.fasta,
279                         database    = self.database,
280                         seq_type    = 'nucl',
281                         algorithm   = self.search_algo,
282                         filtering   = {'max_targets': 100},
283                         num_threads = self.num_threads,
284                         out_path    = self.search_hits,
285                         params      = params)
286
287    def search(self):
288        """A method to launch the sequence similarity search."""
289        # Launch the search algorithm #
290        return self.seqsearch.run()
291
292    #----------------------------- Assigning ---------------------------------#
293    @cached_property
294    def score_frac(self):
295        """
296        Using the parameter `self.score_drop`, which is a percentage (e.g., 2)
297        indicating a drop, we compute the minimum remaining amount of score
298        allowed, as a fraction (e.g., 0.98).
299        """
300        return 1 - (self.score_drop / 100)
301
302    @cached_property
303    def queries(self):
304        """
305        Parses the output of the sequence search program used which returns
306        a list of `QueryResult` objects from the `SearchIO.parse` module.
307        The list contains one Query object per sequence that was originally
308        inputted. Use these objects to access the taxonomic assignments.
309        """
310        # Check if the search has been done already #
311        if not self.search_hits: self.search()
312        # Iterate on the sequence search results #
313        result = [Query(self, query) for query in self.seqsearch.results]
314        # VSEARCH entirely forgets about sequences that had no hits.
315        # Instead of still listing them in the output like BLAST.
316        # So we have to add them back to the list in this awkward manner
317        if self.search_algo == 'vsearch':
318            reported_names = set(query.name for query in result)
319            for seq in self.fasta:
320                if seq.id not in reported_names:
321                    q = type('FakeQuery', (), {'hits': [], 'id': seq.id})
322                    result.append(Query(self, q))
323        # Return #
324        return result
325
326    @cached_property
327    def queries_by_id(self):
328        """
329        References the same Query objects as the `queries` property above,
330        except that this time they are in a dictionary with the query ids
331        (i.e., the original fasta ids) as keys instead of in a list.
332        """
333        return {query.name: query for query in self.queries}
334
335    #------------------------------- Outputs ---------------------------------#
336    @cached_property
337    def out_file(self):
338        """
339        The path to the file that will contain the taxonomic assignments
340        for every sequence.
341        """
342        # Make sure that the output directory exists #
343        self.output_dir.create_if_not_exists()
344        # Return #
345        return self.output_dir + "assignments.txt"
346
347    @cached_property
348    def otu_info(self):
349        """An object giving access to the OTU table information and methods."""
350        from crest4.otu_tables import InfoFromTableOTUs
351        return InfoFromTableOTUs(self, self.otu_table)
352
353    def __call__(self):
354        """Generate outputs."""
355        # Intro message #
356        print('Running crest4 version ' + crest4.__version__)
357        # Iterate #
358        self.out_file.writelines(query.tax_string for query in self.queries)
359        # Special case where an OTU table was passed #
360        if self.otu_table:
361            path_by_rank    = self.output_dir + 'otus_by_rank.tsv'
362            path_cumulative = self.output_dir + 'otus_cumulative.tsv'
363            otus_by_rank    = self.otu_info.otus_by_rank
364            otus_cumulative = self.otu_info.otus_cumulative
365            otus_by_rank.to_csv(path_by_rank, index=False, sep='\t')
366            otus_cumulative.to_csv(path_cumulative, index=False, sep='\t')
367        # Print a success message #
368        msg = "Classification ran successfully. Results are placed in '%s'."
369        print(msg % self.out_file)
370        # Return #
371        return self.out_file
all_db_choices = ('midori253darn', 'silvamod138pr2', 'mitofish', 'ssuome')
class Classify:
 29class Classify:
 30    """
 31    This is the main object offered by the `crest4` package.
 32    It enables you to automatically assign taxonomic names to DNA sequences
 33    obtained from environmental sequencing.
 34
 35    After creating a new instance of a `Classify` object, you can simply call
 36    it to get your input data processed and the assignments output file
 37    generated. Examples are included in the `README.md` file of this package,
 38    which can directly be seen on the GitHub page at:
 39
 40    https://github.com/xapple/crest4/
 41    """
 42
 43    def __init__(self,
 44                 fasta,
 45                 search_algo = 'blast',
 46                 num_threads = 1,
 47                 search_db   = 'ssuome',
 48                 output_dir  = None,
 49                 search_hits = None,
 50                 min_score   = None,
 51                 score_drop  = 2.0,
 52                 min_smlrty  = True,
 53                 otu_table   = None,
 54                 ):
 55        """
 56        Args:
 57
 58            fasta: The path to a single FASTA file as a string.
 59                   These are the sequences that will be taxonomically
 60                   classified.
 61
 62            search_algo: The algorithm used for the sequence similarity search
 63                         that will be run to match the sequences against the
 64                         database chosen. Either `blast` or `vsearch`. No
 65                         other values are currently supported. By default,
 66                         `blast`.
 67
 68            num_threads: The number of processors to use for the sequence
 69                         similarity search. By default, parallelism is turned
 70                         off, and this value is 1. If you pass the value
 71                         `True,` we will run as many processes as there are
 72                         CPUs but no more than 32.
 73
 74            search_db: The database used for the sequence similarity search.
 75                       Either `ssuome`, `silvamod138pr2`, 'mitofish',
 76                       or `midori253darn`. By default, `ssuome`.
 77                       Optionally, the user can provide a custom database by
 78                       specifying the full path to a directory containing all
 79                       required files under `search_db`.
 80                       See the README for more information.
 81
 82            output_dir: The directory into which all the classification
 83                        results will be written to. This defaults to a
 84                        directory with the same name as the original FASTA
 85                        file and a `.crest4` suffix appended.
 86
 87            search_hits: The path where the search results will be stored.
 88                         This defaults to the output directory. However,
 89                         if the search operation has already been completed
 90                         beforehand, specify the path here to skip the
 91                         sequence similarity search step and go directly to
 92                         the taxonomy step. If a hits file exists in the output
 93                         directory and this option is not specified, it is
 94                         deleted and regenerated.
 95
 96            min_score: The minimum bit-score for a search hit to be considered
 97                       when using BLAST as the search algorithm. All hits below
 98                       this score are ignored. When using VSEARCH, this value
 99                       instead indicates the minimum identity between two
100                       sequences for the hit to be considered.
101                       The default is `155` for BLAST and `0.75` for VSEARCH.
102
103            score_drop: Determines the range of hits to retain and the range
104                        to discard based on a drop in percentage from the score
105                        of the best hit. Any hit below the following value:
106                        "(100 - score_drop)/100 * best_hit_score" is ignored.
107                        By default, `2.0`.
108
109            min_smlrty: Determines if the minimum similarity filter is turned
110                        on or off. Pass any value like `False` to turn it off.
111                        The minimum similarity filter prevents classification
112                        to higher ranks when a minimum rank-identity is not
113                        met. The default is `True`.
114
115            otu_table: Optionally, one can specify the path to an existing OTU
116                       table in CSV or TSV format when running `crest4`.
117                       The sequence names in the OTU table must be rows and
118                       have to match the names in the FASTA file. The column,
119                       on the other hand, provide your samples names.
120                       When this option is used, then two extra output files
121                       are generated. Firstly, a table summarizing the
122                       assignment counts per taxa. Secondly, a table
123                       propagating the sequence counts upwards
124                       in a cumulative fashion.
125                       """
126        # Save attributes #
127        self.fasta       = fasta
128        self.search_algo = search_algo
129        self.num_threads = num_threads
130        self.search_db   = search_db
131        self.output_dir  = output_dir
132        self.search_hits = search_hits
133        self.min_score   = min_score
134        self.score_drop  = score_drop
135        self.min_smlrty  = min_smlrty
136        self.otu_table   = otu_table
137        # Assign default values and change others #
138        self.transform()
139        # Validate attributes #
140        self.validate()
141
142    def transform(self):
143        """
144        This method will replace empty attributes with defaults when this is
145        needed and will convert others to proper types.
146        """
147        # The fasta should be a FASTA object #
148        if self.fasta is not None:
149            from fasta import FASTA
150            self.fasta = FASTA(self.fasta)
151        # Default for the number of threads #
152        if not isinstance(self.num_threads, int):
153            if self.num_threads is True:
154                self.num_threads = min(multiprocessing.cpu_count(), 32)
155            elif self.num_threads is False:
156                self.num_threads = 1
157            elif self.num_threads.lower() == 'true':
158                self.num_threads = min(multiprocessing.cpu_count(), 32)
159        # The database is always a string #
160        self.search_db = str(self.search_db)
161        # Default for the output directory #
162        if self.output_dir is None:
163            self.output_dir = self.fasta + '.crest4/'
164        self.output_dir = DirectoryPath(self.output_dir)
165        # The search hits is a file somewhere if passed #
166        if self.search_hits is not None:
167            self.search_hits = FilePath(self.search_hits)
168        # Default for the search hits file if not passed #
169        if self.search_hits is None:
170            self.search_hits = FilePath(self.output_dir + 'search.hits')
171            self.search_hits.remove()
172        # Default for the minimum score #
173        if self.min_score is None:
174            if self.search_algo == 'blast':
175                self.min_score = 155
176            if self.search_algo == 'vsearch':
177                self.min_score = 0.75
178        # The minimum score has to be a number, not a string #
179        try:
180            self.min_score = float(self.min_score)
181        except (ValueError, TypeError):
182            msg = "The minimum score value must be numerical (not '%s')."
183            raise ValueError(msg % self.min_score)
184        # The score drop has to be a float, not a string #
185        try:
186            self.score_drop = float(self.score_drop)
187        except (ValueError, TypeError):
188            msg = "The score drop value must be numerical (not '%s')."
189            raise ValueError(msg % self.score_drop)
190        # Turn off the minimum similarity filter if the user passed any value #
191        if self.min_smlrty is not True: self.min_smlrty = False
192        # The OTU table is a file somewhere if passed #
193        if self.otu_table is not None:
194            self.otu_table = FilePath(self.otu_table)
195            self.otu_table.must_exist()
196
197    def validate(self):
198        """
199        This method will raise an Exception if any of the arguments passed by
200        the user are illegal.
201        """
202        # The fasta should exist if passed #
203        if self.fasta is not None:
204            self.fasta.must_exist()
205        # Either the FASTA file or the hits file has to contain something #
206        if not self.fasta and not self.search_hits:
207            msg = "Neither the FASTA file at '%s' nor the search hits file" \
208                  " at '%s' contain any data. Cannot proceed."
209            raise Exception(msg % (self.fasta, self.search_hits))
210        # Check the search algorithm #
211        if self.search_algo not in ('blast', 'vsearch'):
212            msg = "The search algorithm '%s' is not supported."
213            raise ValueError(msg % self.search_algo)
214        # The search database is a known entry or exists on the filesystem #
215        if self.search_db not in all_db_choices:
216            if not os.path.exists(self.search_db):
217                msg = "The search database '%s' is not supported."
218                raise ValueError(msg % self.search_db)
219        # Check the minimum score value is above zero #
220        if self.min_score < 0.0:
221            msg = "The minimum score cannot be smaller than zero ('%s')."
222            raise ValueError(msg % self.min_score)
223        # Check the minimum score value is below one #
224        if self.min_score > 1.0:
225            if self.search_algo == 'vsearch':
226                msg = "The minimum score cannot be more than 1.0 when" \
227                      " using VSEARCH ('%s') because it represents the" \
228                      " the minimum identity between two sequences."
229                raise ValueError(msg % self.min_score)
230        # Check the score drop value #
231        if self.score_drop < 0.0:
232            msg = "The score drop value cannot be smaller than zero ('%s')."
233            raise ValueError(msg % self.min_score)
234        if self.score_drop > 100.0:
235            msg = "The score drop value cannot be over 100 ('%s')."
236            raise ValueError(msg % self.min_score)
237
238    def __repr__(self):
239        """A simple representation of this object to avoid memory addresses."""
240        return "<%s object on '%s'>" % (self.__class__.__name__, self.fasta)
241
242    @cached_property
243    def database(self):
244        """
245        Retrieve the database object that the user has selected.
246        This can be either a standard database or a custom-specified path.
247        """
248        if self.search_db not in all_db_choices:
249            # Take the absolute path #
250            user_path = os.path.abspath(self.search_db)
251            # Make the object #
252            return CrestDatabase(custom_path=user_path)
253        else:
254            return getattr(crest4.databases, self.search_db)
255
256    #------------------------------ Searching --------------------------------#
257    @cached_property
258    def seqsearch(self):
259        """
260        An object representing the sequence similarity search.
261        Makes use of the `seqsearch` module. For reference:
262
263        * Setting `-outfmt` to 5 means XML output.
264        * Setting `-outfmt` to 6 means tabular output.
265        * Setting `-outfmt` to 7 means tabular output with comments.
266        """
267        # Initialize #
268        params = {}
269        # If the user chose BLAST then we have to specify tabular output #
270        if self.search_algo == 'blast':
271            params = {'-outfmt': '7 qseqid sseqid bitscore length nident'}
272        # In case the user chose VSEARCH we specify the minimum identity
273        # and the minimum sequence match length
274        if self.search_algo == 'vsearch':
275            params = {'--id':      self.min_score,
276                      '--mincols': 25}
277        # Build and return the object #
278        from seqsearch.search import SeqSearch
279        return SeqSearch(input_fasta = self.fasta,
280                         database    = self.database,
281                         seq_type    = 'nucl',
282                         algorithm   = self.search_algo,
283                         filtering   = {'max_targets': 100},
284                         num_threads = self.num_threads,
285                         out_path    = self.search_hits,
286                         params      = params)
287
288    def search(self):
289        """A method to launch the sequence similarity search."""
290        # Launch the search algorithm #
291        return self.seqsearch.run()
292
293    #----------------------------- Assigning ---------------------------------#
294    @cached_property
295    def score_frac(self):
296        """
297        Using the parameter `self.score_drop`, which is a percentage (e.g., 2)
298        indicating a drop, we compute the minimum remaining amount of score
299        allowed, as a fraction (e.g., 0.98).
300        """
301        return 1 - (self.score_drop / 100)
302
303    @cached_property
304    def queries(self):
305        """
306        Parses the output of the sequence search program used which returns
307        a list of `QueryResult` objects from the `SearchIO.parse` module.
308        The list contains one Query object per sequence that was originally
309        inputted. Use these objects to access the taxonomic assignments.
310        """
311        # Check if the search has been done already #
312        if not self.search_hits: self.search()
313        # Iterate on the sequence search results #
314        result = [Query(self, query) for query in self.seqsearch.results]
315        # VSEARCH entirely forgets about sequences that had no hits.
316        # Instead of still listing them in the output like BLAST.
317        # So we have to add them back to the list in this awkward manner
318        if self.search_algo == 'vsearch':
319            reported_names = set(query.name for query in result)
320            for seq in self.fasta:
321                if seq.id not in reported_names:
322                    q = type('FakeQuery', (), {'hits': [], 'id': seq.id})
323                    result.append(Query(self, q))
324        # Return #
325        return result
326
327    @cached_property
328    def queries_by_id(self):
329        """
330        References the same Query objects as the `queries` property above,
331        except that this time they are in a dictionary with the query ids
332        (i.e., the original fasta ids) as keys instead of in a list.
333        """
334        return {query.name: query for query in self.queries}
335
336    #------------------------------- Outputs ---------------------------------#
337    @cached_property
338    def out_file(self):
339        """
340        The path to the file that will contain the taxonomic assignments
341        for every sequence.
342        """
343        # Make sure that the output directory exists #
344        self.output_dir.create_if_not_exists()
345        # Return #
346        return self.output_dir + "assignments.txt"
347
348    @cached_property
349    def otu_info(self):
350        """An object giving access to the OTU table information and methods."""
351        from crest4.otu_tables import InfoFromTableOTUs
352        return InfoFromTableOTUs(self, self.otu_table)
353
354    def __call__(self):
355        """Generate outputs."""
356        # Intro message #
357        print('Running crest4 version ' + crest4.__version__)
358        # Iterate #
359        self.out_file.writelines(query.tax_string for query in self.queries)
360        # Special case where an OTU table was passed #
361        if self.otu_table:
362            path_by_rank    = self.output_dir + 'otus_by_rank.tsv'
363            path_cumulative = self.output_dir + 'otus_cumulative.tsv'
364            otus_by_rank    = self.otu_info.otus_by_rank
365            otus_cumulative = self.otu_info.otus_cumulative
366            otus_by_rank.to_csv(path_by_rank, index=False, sep='\t')
367            otus_cumulative.to_csv(path_cumulative, index=False, sep='\t')
368        # Print a success message #
369        msg = "Classification ran successfully. Results are placed in '%s'."
370        print(msg % self.out_file)
371        # Return #
372        return self.out_file

This is the main object offered by the crest4 package. It enables you to automatically assign taxonomic names to DNA sequences obtained from environmental sequencing.

After creating a new instance of a Classify object, you can simply call it to get your input data processed and the assignments output file generated. Examples are included in the README.md file of this package, which can directly be seen on the GitHub page at:

https://github.com/xapple/crest4/

Classify( fasta, search_algo='blast', num_threads=1, search_db='ssuome', output_dir=None, search_hits=None, min_score=None, score_drop=2.0, min_smlrty=True, otu_table=None)
 43    def __init__(self,
 44                 fasta,
 45                 search_algo = 'blast',
 46                 num_threads = 1,
 47                 search_db   = 'ssuome',
 48                 output_dir  = None,
 49                 search_hits = None,
 50                 min_score   = None,
 51                 score_drop  = 2.0,
 52                 min_smlrty  = True,
 53                 otu_table   = None,
 54                 ):
 55        """
 56        Args:
 57
 58            fasta: The path to a single FASTA file as a string.
 59                   These are the sequences that will be taxonomically
 60                   classified.
 61
 62            search_algo: The algorithm used for the sequence similarity search
 63                         that will be run to match the sequences against the
 64                         database chosen. Either `blast` or `vsearch`. No
 65                         other values are currently supported. By default,
 66                         `blast`.
 67
 68            num_threads: The number of processors to use for the sequence
 69                         similarity search. By default, parallelism is turned
 70                         off, and this value is 1. If you pass the value
 71                         `True,` we will run as many processes as there are
 72                         CPUs but no more than 32.
 73
 74            search_db: The database used for the sequence similarity search.
 75                       Either `ssuome`, `silvamod138pr2`, 'mitofish',
 76                       or `midori253darn`. By default, `ssuome`.
 77                       Optionally, the user can provide a custom database by
 78                       specifying the full path to a directory containing all
 79                       required files under `search_db`.
 80                       See the README for more information.
 81
 82            output_dir: The directory into which all the classification
 83                        results will be written to. This defaults to a
 84                        directory with the same name as the original FASTA
 85                        file and a `.crest4` suffix appended.
 86
 87            search_hits: The path where the search results will be stored.
 88                         This defaults to the output directory. However,
 89                         if the search operation has already been completed
 90                         beforehand, specify the path here to skip the
 91                         sequence similarity search step and go directly to
 92                         the taxonomy step. If a hits file exists in the output
 93                         directory and this option is not specified, it is
 94                         deleted and regenerated.
 95
 96            min_score: The minimum bit-score for a search hit to be considered
 97                       when using BLAST as the search algorithm. All hits below
 98                       this score are ignored. When using VSEARCH, this value
 99                       instead indicates the minimum identity between two
100                       sequences for the hit to be considered.
101                       The default is `155` for BLAST and `0.75` for VSEARCH.
102
103            score_drop: Determines the range of hits to retain and the range
104                        to discard based on a drop in percentage from the score
105                        of the best hit. Any hit below the following value:
106                        "(100 - score_drop)/100 * best_hit_score" is ignored.
107                        By default, `2.0`.
108
109            min_smlrty: Determines if the minimum similarity filter is turned
110                        on or off. Pass any value like `False` to turn it off.
111                        The minimum similarity filter prevents classification
112                        to higher ranks when a minimum rank-identity is not
113                        met. The default is `True`.
114
115            otu_table: Optionally, one can specify the path to an existing OTU
116                       table in CSV or TSV format when running `crest4`.
117                       The sequence names in the OTU table must be rows and
118                       have to match the names in the FASTA file. The column,
119                       on the other hand, provide your samples names.
120                       When this option is used, then two extra output files
121                       are generated. Firstly, a table summarizing the
122                       assignment counts per taxa. Secondly, a table
123                       propagating the sequence counts upwards
124                       in a cumulative fashion.
125                       """
126        # Save attributes #
127        self.fasta       = fasta
128        self.search_algo = search_algo
129        self.num_threads = num_threads
130        self.search_db   = search_db
131        self.output_dir  = output_dir
132        self.search_hits = search_hits
133        self.min_score   = min_score
134        self.score_drop  = score_drop
135        self.min_smlrty  = min_smlrty
136        self.otu_table   = otu_table
137        # Assign default values and change others #
138        self.transform()
139        # Validate attributes #
140        self.validate()

Args:

fasta: The path to a single FASTA file as a string.
       These are the sequences that will be taxonomically
       classified.

search_algo: The algorithm used for the sequence similarity search
             that will be run to match the sequences against the
             database chosen. Either `blast` or `vsearch`. No
             other values are currently supported. By default,
             `blast`.

num_threads: The number of processors to use for the sequence
             similarity search. By default, parallelism is turned
             off, and this value is 1. If you pass the value
             `True,` we will run as many processes as there are
             CPUs but no more than 32.

search_db: The database used for the sequence similarity search.
           Either `ssuome`, `silvamod138pr2`, 'mitofish',
           or `midori253darn`. By default, `ssuome`.
           Optionally, the user can provide a custom database by
           specifying the full path to a directory containing all
           required files under `search_db`.
           See the README for more information.

output_dir: The directory into which all the classification
            results will be written to. This defaults to a
            directory with the same name as the original FASTA
            file and a `.crest4` suffix appended.

search_hits: The path where the search results will be stored.
             This defaults to the output directory. However,
             if the search operation has already been completed
             beforehand, specify the path here to skip the
             sequence similarity search step and go directly to
             the taxonomy step. If a hits file exists in the output
             directory and this option is not specified, it is
             deleted and regenerated.

min_score: The minimum bit-score for a search hit to be considered
           when using BLAST as the search algorithm. All hits below
           this score are ignored. When using VSEARCH, this value
           instead indicates the minimum identity between two
           sequences for the hit to be considered.
           The default is `155` for BLAST and `0.75` for VSEARCH.

score_drop: Determines the range of hits to retain and the range
            to discard based on a drop in percentage from the score
            of the best hit. Any hit below the following value:
            "(100 - score_drop)/100 * best_hit_score" is ignored.
            By default, `2.0`.

min_smlrty: Determines if the minimum similarity filter is turned
            on or off. Pass any value like `False` to turn it off.
            The minimum similarity filter prevents classification
            to higher ranks when a minimum rank-identity is not
            met. The default is `True`.

otu_table: Optionally, one can specify the path to an existing OTU
           table in CSV or TSV format when running `crest4`.
           The sequence names in the OTU table must be rows and
           have to match the names in the FASTA file. The column,
           on the other hand, provide your samples names.
           When this option is used, then two extra output files
           are generated. Firstly, a table summarizing the
           assignment counts per taxa. Secondly, a table
           propagating the sequence counts upwards
           in a cumulative fashion.
fasta
search_algo
num_threads
search_db
output_dir
search_hits
min_score
score_drop
min_smlrty
otu_table
def transform(self):
142    def transform(self):
143        """
144        This method will replace empty attributes with defaults when this is
145        needed and will convert others to proper types.
146        """
147        # The fasta should be a FASTA object #
148        if self.fasta is not None:
149            from fasta import FASTA
150            self.fasta = FASTA(self.fasta)
151        # Default for the number of threads #
152        if not isinstance(self.num_threads, int):
153            if self.num_threads is True:
154                self.num_threads = min(multiprocessing.cpu_count(), 32)
155            elif self.num_threads is False:
156                self.num_threads = 1
157            elif self.num_threads.lower() == 'true':
158                self.num_threads = min(multiprocessing.cpu_count(), 32)
159        # The database is always a string #
160        self.search_db = str(self.search_db)
161        # Default for the output directory #
162        if self.output_dir is None:
163            self.output_dir = self.fasta + '.crest4/'
164        self.output_dir = DirectoryPath(self.output_dir)
165        # The search hits is a file somewhere if passed #
166        if self.search_hits is not None:
167            self.search_hits = FilePath(self.search_hits)
168        # Default for the search hits file if not passed #
169        if self.search_hits is None:
170            self.search_hits = FilePath(self.output_dir + 'search.hits')
171            self.search_hits.remove()
172        # Default for the minimum score #
173        if self.min_score is None:
174            if self.search_algo == 'blast':
175                self.min_score = 155
176            if self.search_algo == 'vsearch':
177                self.min_score = 0.75
178        # The minimum score has to be a number, not a string #
179        try:
180            self.min_score = float(self.min_score)
181        except (ValueError, TypeError):
182            msg = "The minimum score value must be numerical (not '%s')."
183            raise ValueError(msg % self.min_score)
184        # The score drop has to be a float, not a string #
185        try:
186            self.score_drop = float(self.score_drop)
187        except (ValueError, TypeError):
188            msg = "The score drop value must be numerical (not '%s')."
189            raise ValueError(msg % self.score_drop)
190        # Turn off the minimum similarity filter if the user passed any value #
191        if self.min_smlrty is not True: self.min_smlrty = False
192        # The OTU table is a file somewhere if passed #
193        if self.otu_table is not None:
194            self.otu_table = FilePath(self.otu_table)
195            self.otu_table.must_exist()

This method will replace empty attributes with defaults when this is needed and will convert others to proper types.

def validate(self):
197    def validate(self):
198        """
199        This method will raise an Exception if any of the arguments passed by
200        the user are illegal.
201        """
202        # The fasta should exist if passed #
203        if self.fasta is not None:
204            self.fasta.must_exist()
205        # Either the FASTA file or the hits file has to contain something #
206        if not self.fasta and not self.search_hits:
207            msg = "Neither the FASTA file at '%s' nor the search hits file" \
208                  " at '%s' contain any data. Cannot proceed."
209            raise Exception(msg % (self.fasta, self.search_hits))
210        # Check the search algorithm #
211        if self.search_algo not in ('blast', 'vsearch'):
212            msg = "The search algorithm '%s' is not supported."
213            raise ValueError(msg % self.search_algo)
214        # The search database is a known entry or exists on the filesystem #
215        if self.search_db not in all_db_choices:
216            if not os.path.exists(self.search_db):
217                msg = "The search database '%s' is not supported."
218                raise ValueError(msg % self.search_db)
219        # Check the minimum score value is above zero #
220        if self.min_score < 0.0:
221            msg = "The minimum score cannot be smaller than zero ('%s')."
222            raise ValueError(msg % self.min_score)
223        # Check the minimum score value is below one #
224        if self.min_score > 1.0:
225            if self.search_algo == 'vsearch':
226                msg = "The minimum score cannot be more than 1.0 when" \
227                      " using VSEARCH ('%s') because it represents the" \
228                      " the minimum identity between two sequences."
229                raise ValueError(msg % self.min_score)
230        # Check the score drop value #
231        if self.score_drop < 0.0:
232            msg = "The score drop value cannot be smaller than zero ('%s')."
233            raise ValueError(msg % self.min_score)
234        if self.score_drop > 100.0:
235            msg = "The score drop value cannot be over 100 ('%s')."
236            raise ValueError(msg % self.min_score)

This method will raise an Exception if any of the arguments passed by the user are illegal.

database
242    @cached_property
243    def database(self):
244        """
245        Retrieve the database object that the user has selected.
246        This can be either a standard database or a custom-specified path.
247        """
248        if self.search_db not in all_db_choices:
249            # Take the absolute path #
250            user_path = os.path.abspath(self.search_db)
251            # Make the object #
252            return CrestDatabase(custom_path=user_path)
253        else:
254            return getattr(crest4.databases, self.search_db)

Retrieve the database object that the user has selected. This can be either a standard database or a custom-specified path.

seqsearch
257    @cached_property
258    def seqsearch(self):
259        """
260        An object representing the sequence similarity search.
261        Makes use of the `seqsearch` module. For reference:
262
263        * Setting `-outfmt` to 5 means XML output.
264        * Setting `-outfmt` to 6 means tabular output.
265        * Setting `-outfmt` to 7 means tabular output with comments.
266        """
267        # Initialize #
268        params = {}
269        # If the user chose BLAST then we have to specify tabular output #
270        if self.search_algo == 'blast':
271            params = {'-outfmt': '7 qseqid sseqid bitscore length nident'}
272        # In case the user chose VSEARCH we specify the minimum identity
273        # and the minimum sequence match length
274        if self.search_algo == 'vsearch':
275            params = {'--id':      self.min_score,
276                      '--mincols': 25}
277        # Build and return the object #
278        from seqsearch.search import SeqSearch
279        return SeqSearch(input_fasta = self.fasta,
280                         database    = self.database,
281                         seq_type    = 'nucl',
282                         algorithm   = self.search_algo,
283                         filtering   = {'max_targets': 100},
284                         num_threads = self.num_threads,
285                         out_path    = self.search_hits,
286                         params      = params)

An object representing the sequence similarity search. Makes use of the seqsearch module. For reference:

  • Setting -outfmt to 5 means XML output.
  • Setting -outfmt to 6 means tabular output.
  • Setting -outfmt to 7 means tabular output with comments.
def search(self):
288    def search(self):
289        """A method to launch the sequence similarity search."""
290        # Launch the search algorithm #
291        return self.seqsearch.run()

A method to launch the sequence similarity search.

score_frac
294    @cached_property
295    def score_frac(self):
296        """
297        Using the parameter `self.score_drop`, which is a percentage (e.g., 2)
298        indicating a drop, we compute the minimum remaining amount of score
299        allowed, as a fraction (e.g., 0.98).
300        """
301        return 1 - (self.score_drop / 100)

Using the parameter self.score_drop, which is a percentage (e.g., 2) indicating a drop, we compute the minimum remaining amount of score allowed, as a fraction (e.g., 0.98).

queries
303    @cached_property
304    def queries(self):
305        """
306        Parses the output of the sequence search program used which returns
307        a list of `QueryResult` objects from the `SearchIO.parse` module.
308        The list contains one Query object per sequence that was originally
309        inputted. Use these objects to access the taxonomic assignments.
310        """
311        # Check if the search has been done already #
312        if not self.search_hits: self.search()
313        # Iterate on the sequence search results #
314        result = [Query(self, query) for query in self.seqsearch.results]
315        # VSEARCH entirely forgets about sequences that had no hits.
316        # Instead of still listing them in the output like BLAST.
317        # So we have to add them back to the list in this awkward manner
318        if self.search_algo == 'vsearch':
319            reported_names = set(query.name for query in result)
320            for seq in self.fasta:
321                if seq.id not in reported_names:
322                    q = type('FakeQuery', (), {'hits': [], 'id': seq.id})
323                    result.append(Query(self, q))
324        # Return #
325        return result

Parses the output of the sequence search program used which returns a list of QueryResult objects from the SearchIO.parse module. The list contains one Query object per sequence that was originally inputted. Use these objects to access the taxonomic assignments.

queries_by_id
327    @cached_property
328    def queries_by_id(self):
329        """
330        References the same Query objects as the `queries` property above,
331        except that this time they are in a dictionary with the query ids
332        (i.e., the original fasta ids) as keys instead of in a list.
333        """
334        return {query.name: query for query in self.queries}

References the same Query objects as the queries property above, except that this time they are in a dictionary with the query ids (i.e., the original fasta ids) as keys instead of in a list.

out_file
337    @cached_property
338    def out_file(self):
339        """
340        The path to the file that will contain the taxonomic assignments
341        for every sequence.
342        """
343        # Make sure that the output directory exists #
344        self.output_dir.create_if_not_exists()
345        # Return #
346        return self.output_dir + "assignments.txt"

The path to the file that will contain the taxonomic assignments for every sequence.

otu_info
348    @cached_property
349    def otu_info(self):
350        """An object giving access to the OTU table information and methods."""
351        from crest4.otu_tables import InfoFromTableOTUs
352        return InfoFromTableOTUs(self, self.otu_table)

An object giving access to the OTU table information and methods.