fasta.core

Written by Lucas Sinclair. MIT Licensed. Contact at www.sinclair.bio

View Source

  1#!/usr/bin/env python3
  2# -*- coding: utf-8 -*-
  3
  4"""
  5Written by Lucas Sinclair.
  6MIT Licensed.
  7Contact at www.sinclair.bio
  8"""
  9
 10# Built-in modules #
 11import os, sys, io, gzip, shutil, itertools, platform
 12from collections import Counter, OrderedDict
 13from six import string_types
 14
 15# Internal modules #
 16from fasta import graphs
 17
 18# First party modules #
 19from plumbing.common     import isubsample
 20from plumbing.color      import Color
 21from plumbing.cache      import property_cached
 22from autopaths.file_path import FilePath
 23from autopaths.tmp_path  import new_temp_path
 24
 25# Third party modules #
 26from tqdm import tqdm
 27if platform.system() == 'Windows': import pbs3 as sh
 28else: import sh
 29
 30# Constants #
 31class Dummy: pass
 32
 33###############################################################################
 34class FASTA(FilePath):
 35    """
 36    A single FASTA file somewhere in the filesystem. You can read from it in
    several convenient ways. You can write to it in an automatically buffered
 38    way. There are several other things you can do with a FASTA file.
 39    """
 40
 41    format      = 'fasta'
 42    ext         = 'fasta'
 43    buffer_size = 1000
 44
 45    def __len__(self): return self.count
 46
 47    def __repr__(self):
 48        return '<%s object on "%s">' % (self.__class__.__name__, self.path)
 49
 50    def __contains__(self, other): return other in self.ids
 51
 52    def __enter__(self): return self.create()
 53
 54    def __exit__(self, exc_type, exc_value, traceback): self.close()
 55
 56    def __iter__(self):
 57        for seq in self.parse(): yield seq
 58        self.close()
 59
 60    def __getitem__(self, key):
 61        if   isinstance(key, string_types): return self.sequences[key]
 62        elif isinstance(key, int):          return self.sequences.items()[key]
 63        elif isinstance(key, slice):
 64            return itertools.islice(self, key.start, key.stop, key.step)
 65
 66    #----------------------------- Properties --------------------------------#
 67    @property
 68    def gzipped(self): return True if self.path.endswith('gz') else False
 69
 70    @property
 71    def first(self):
 72        """Just the first sequence."""
 73        from Bio import SeqIO
 74        self.open()
 75        seq = next(SeqIO.parse(self.handle, self.format))
 76        self.close()
 77        return seq
 78
 79    @property_cached
 80    def count(self):
 81        """
 82        Should probably check for file size changes instead of just
 83        caching once TODO.
 84        """
 85        # For debugging purposes #
 86        if False: print("-> counting reads in `%s`" % self.path)
 87        # If we are gzipped we can just use zgrep #
 88        if self.gzipped:
 89            return int(sh.zgrep('-c', "^>", self.path, _ok_code=[0,1]))
 90        else:
 91            return int(sh.grep('-c', "^>", self.path, _ok_code=[0,1]))
 92
 93    @property
 94    def lengths(self):
 95        """All the lengths, one by one, in a list."""
 96        return map(len, self)
 97
 98    @property_cached
 99    def lengths_counter(self):
100        """A Counter() object with all the lengths inside."""
101        return Counter((len(s) for s in self.parse()))
102
103    #-------------------------- Basic IO methods -----------------------------#
104    def open(self, mode='r'):
105        # Two cases #
106        if self.gzipped:
107            self.handle = gzip.open(self.path, mode)
108            self.handle = io.TextIOWrapper(self.handle, encoding='utf8')
109        else:
110            self.handle = open(self.path, mode)
111        # For convenience #
112        return self.handle
113
114    def close(self):
115        # Case we were writing to the file #
116        if hasattr(self, 'buffer'):
117            self.flush()
118            del self.buffer
119        # Standard case #
120        self.handle.close()
121        # For pickling purposes (can't use dill on gzip handles) #
122        del self.handle
123
124    def parse(self):
125        self.open()
126        from Bio import SeqIO
127        return SeqIO.parse(self.handle, self.format)
128
129    @property
130    def progress(self):
131        """Just like self.parse() but will display a progress bar."""
132        return tqdm(self, total=len(self))
133
134    def create(self):
135        """Create the file on the file system."""
136        self.buffer = []
137        self.buf_count = 0
138        if not self.directory.exists: self.directory.create()
139        self.open('w')
140        return self
141
142    def add(self, seqs):
143        """Use this method to add a bunch of SeqRecords at once."""
144        for seq in seqs: self.add_seq(seq)
145
146    def add_seq(self, seq):
147        """Use this method to add a SeqRecord object to this fasta."""
148        self.buffer.append(seq)
149        self.buf_count += 1
150        if self.buf_count % self.buffer_size == 0: self.flush()
151
152    def add_str(self, seq, name=None, description=""):
153        """Use this method to add a sequence as a string to this fasta."""
154        from Bio.SeqRecord import SeqRecord
155        from Bio.Seq import Seq
156        self.add_seq(SeqRecord(Seq(seq), id=name, description=description))
157
158    def add_fasta(self, path):
159        """Use this method to add an other fasta to this fasta."""
160        path = FASTA(path)
161        self.add(path)
162
163    def add_fastas(self, paths):
164        """Use this method to add a bunch of fastas to this fasta."""
165        for p in paths: self.add_fasta(p)
166
167    def flush(self):
168        """Empty the buffer."""
169        from Bio import SeqIO
170        for seq in self.buffer:
171            SeqIO.write(seq, self.handle, self.format)
172        self.buffer = []
173
174    def write(self, reads):
175        from Bio import SeqIO
176        if not self.directory.exists: self.directory.create()
177        self.open('w')
178        SeqIO.write(reads, self.handle, self.format)
179        self.close()
180        return self
181
182    #-------------------------- Compressing the data -------------------------#
183    def compress(self, new_path=None, remove_orig=False, method='slow'):
184        """Turn this FASTA file into a gzipped FASTA file."""
185        # Check we are not compressed already #
186        if self.gzipped:
187            msg = "The fasta file '%s' is already compressed."
188            raise Exception(msg % self.path)
189        # Pick the new path #
190        if new_path is None: new_path = self.path + '.gz'
191        # Do it the fast way or the slow way #
192        if method == 'fast': self.compress_fast(new_path)
193        else:                self.compress_slow(new_path)
194        # Optionally remove the original uncompressed file #
195        if remove_orig: self.remove()
196        # Update the internal path #
197        self.path = new_path
198        # Return #
199        return self.path
200
201    def compress_slow(self, new_path):
202        """Do the compression internally via python."""
203        with gzip.open(new_path, 'wb') as handle:
204            shutil.copyfileobj(self.open('rb'), handle)
205
206    def compress_fast(self, new_path):
207        """Do the compression with an external shell command call."""
208        # We don't want python to be buffering the text for speed #
209        from shell_command import shell_output
210        cmd = 'gzip --stdout %s > %s' % (self.path, new_path)
211        return shell_output(cmd)
212
213    #------------------------- When IDs are important ------------------------#
214    @property_cached
215    def ids(self):
216        """A frozen set of all unique IDs in the file."""
217        as_list = [seq.description.split()[0] for seq in self]
218        as_set = frozenset(as_list)
219        assert len(as_set) == len(as_list)
220        return as_set
221
222    def get_id(self, id_num):
223        """
224        Extract one sequence from the file based on its ID.
225        This is highly ineffective.
226        Consider using the SQLite API instead or memory map the file.
227        """
228        for seq in self:
229            if seq.id == id_num: return seq
230
231    @property_cached
232    def sequences(self):
233        """
234        Another way of easily retrieving sequences. Also highly ineffective.
235        Consider using the SQLite API instead.
236        """
237        return OrderedDict(((seq.id, seq) for seq in self))
238
239    @property_cached
240    def sql(self):
241        """
242        If you access this attribute, we will build an SQLite database
243        out of the FASTA file and you will be able access everything in an
244        indexed fashion, and use the blaze library via sql.frame
245        """
246        from fasta.indexed import DatabaseFASTA, fasta_to_sql
247        db = DatabaseFASTA(self.prefix_path + ".db")
248        if not db.exists: fasta_to_sql(self.path, db.path)
249        return db
250
251    @property_cached
252    def length_by_id(self):
253        """
254        In some use cases you just need the sequence lengths in an indexed
255        fashion. If you access this attribute, we will make a hash map in
256        memory.
257        """
258        hash_map = dict((seq.id, len(seq)) for seq in self)
259        tmp = hash_map.copy()
260        hash_map.update(tmp)
261        return hash_map
262
263    #----------------- Ways of interacting with the data --------------------#
264    def subsample(self, down_to=1, new_path=None, verbose=True):
265        """Pick a given number of sequences from the file pseudo-randomly."""
266        # Pick the destination path #
267        if new_path is None:
268            subsampled = self.__class__(new_temp_path())
269        elif isinstance(new_path, FASTA):
270            subsampled = new_path
271        else:
272            subsampled = self.__class__(new_path)
273        # Check size #
274        if down_to > len(self):
275            message = "Can't subsample %s down to %i. Only down to %i."
276            print(Color.ylw + message % (self, down_to, len(self)) + Color.end)
277            self.copy(new_path)
278            return
279        # Select verbosity #
280        import tqdm
281        if verbose: wrapper = lambda x: tqdm.tqdm(x, total=self.count)
282        else: wrapper = lambda x: x
283        # Generator #
284        def iterator():
285            for read in wrapper(isubsample(self, down_to)):
286                yield read
287        # Do it #
288        subsampled.write(iterator())
289        # Did it work #
290        assert len(subsampled) == down_to
291        # Return #
292        return subsampled
293
294    def rename_with_num(self, prefix="", new_path=None, remove_desc=True):
295        """Rename every sequence based on a prefix and a number."""
296        # Temporary path #
297        if new_path is None: numbered = self.__class__(new_temp_path())
298        else:                numbered = self.__class__(new_path)
299        # Generator #
300        def numbered_iterator():
301            for i,read in enumerate(self):
302                read.id  = prefix + str(i)
303                read.seq = read.seq.upper()
304                if remove_desc: read.description = ""
305                yield read
306        # Do it #
307        numbered.write(numbered_iterator())
308        # Replace it #
309        if new_path is None:
310            os.remove(self.path)
311            shutil.move(numbered, self.path)
312        # Return #
313        return numbered
314
315    def rename_with_prefix(self, prefix="", new_path=None, in_place=True,
316                           remove_desc=True):
317        """Rename every sequence based on a prefix."""
318        # Temporary path #
319        if new_path is None: prefixed = self.__class__(new_temp_path())
320        else:                prefixed = self.__class__(new_path)
321        # Generator #
322        def prefixed_iterator():
323            for i,read in enumerate(self):
324                read.id = prefix + read.id
325                if remove_desc: read.description = ""
326                yield read
327        # Do it #
328        prefixed.write(prefixed_iterator())
329        # Replace it #
330        if in_place:
331            os.remove(self.path)
332            shutil.move(prefixed, self.path)
333        # Return #
334        return prefixed
335
336    def rename_sequences(self, mapping, new_path=None, in_place=False):
337        """
338        Will rename all sequences in the current fasta file using
339        the mapping dictionary also provided. In place or at a new path.
340        """
341        # Where is the new file #
342        if new_path is None: new_fasta = self.__class__(new_temp_path())
343        else:                new_fasta = self.__class__(new_path)
344        # Do it #
345        new_fasta.create()
346        for seq in self:
347            new_name = mapping[seq.description]
348            nucleotides = str(seq.seq)
349            new_fasta.add_str(nucleotides, new_name)
350        new_fasta.close()
351        # Return #
352        if in_place:
353            os.remove(self.path)
354            shutil.move(new_fasta, self.path)
355            return self
356        else: return new_fasta
357
358    def extract_length(self, lower_bound=None,
359                             upper_bound=None,
360                             new_path=None):
361        """Extract a certain length fraction and place them in a new file."""
362        # Temporary path #
363        if new_path is None: fraction = self.__class__(new_temp_path())
364        elif isinstance(new_path, FASTA): fraction = new_path
365        else:                fraction = self.__class__(new_path)
366        # Generator #
367        if lower_bound is None: lower_bound = 0
368        if upper_bound is None: upper_bound = sys.maxsize
369        def fraction_iterator():
370            for read in self:
371                if lower_bound <= len(read) <= upper_bound:
372                    yield read
373        # Do it #
374        fraction.write(fraction_iterator())
375        # Return #
376        return fraction
377
378    def extract_sequences(self, ids,
379                          new_path = None,
380                          in_place = False,
381                          verbose  = False):
382        """
383        Will take all the sequences from the current file who's id appears in
384        the ids given and place them in a new file.
385        If no path is given, a new temporary path is created and returned.
386        If `in_place` is set to True, the original file is removed and replaced
387        with the result of the extraction.
388        Optionally, the argument `ids` can be a function which has to take
389        one string as only input and return True for keeping the sequence and
390        False for discarding the sequence.
391        """
392        # Temporary path #
393        if new_path is None: new_fasta = self.__class__(new_temp_path())
394        elif isinstance(new_path, FASTA): new_fasta = new_path
395        else:                new_fasta = self.__class__(new_path)
396        # Select verbosity #
397        import tqdm
398        wrapper = tqdm.tqdm if verbose else lambda x: x
399        # Simple generator #
400        def simple_match(reads):
401            for r in wrapper(reads):
402                if r.id in ids: yield r
403        # Generator with function #
404        def function_match(reads):
405            for r in wrapper(reads):
406                if ids(r.id): yield r
407        # Do it #
408        if callable(ids):
409            new_fasta.write(function_match(self))
410        else:
411            new_fasta.write(simple_match(self))
412        # Return #
413        if in_place:
414            os.remove(self.path)
415            shutil.move(new_fasta, self.path)
416            return self
417        else: return new_fasta
418
419    def remove_trailing_stars(self, new_path=None, in_place=True, check=False):
420        """
421        Remove the bad character that can be inserted by some programs at the
422        end of sequences.
423        """
424        # Optional check #
425        if check and int(sh.grep('-c', '\\*', self.path, _ok_code=[0,1])) == 0:
426            return self
427        # Faster with bash utilities #
428        if in_place is True:
429            sh.sed('-i', 's/\\*$//g', self.path)
430            return self
431        # Standard way #
432        if new_path is None: new_fasta = self.__class__(new_temp_path())
433        else:                new_fasta = self.__class__(new_path)
434        new_fasta.create()
435        for seq in self: new_fasta.add_str(str(seq.seq).rstrip('*'), seq.id)
436        new_fasta.close()
437        # Return #
438        return new_fasta
439
440    def _generator_mod(self, generator, new_path=None, in_place=True):
441        """
442        Generic way of modifying the current fasta either in place or
443        with a new destination pass.
444        Simply, pass a generator function that will yield the new sequences
445        given the current ones.
446        """
447        # Temporary path #
448        if new_path is None: new_fasta = self.__class__(new_temp_path())
449        elif isinstance(new_path, FASTA): new_fasta = new_path
450        else: new_fasta = self.__class__(new_path)
451        # Do it #
452        new_fasta.write(generator())
453        # Return #
454        if in_place:
455            os.remove(self.path)
456            shutil.move(new_fasta, self.path)
457            return self
458        else: return new_fasta
459
460    def remove_duplicates(self, new_path=None, in_place=True):
461        """
462        If several entries have the same ID in the FASTA file, keep only the
463        first appearance and remove all the others.
464        """
465        # Generator #
466        def unique_entries():
467            seen = set()
468            for i, read in enumerate(self):
469                if read.id in seen: continue
470                else:
471                    seen.add(read.id)
472                    yield read
473        # Return #
474        return self._generator_mod(unique_entries, new_path, in_place)
475
476    def convert_U_to_T(self, new_path=None, in_place=True):
477        # Generator #
478        def all_U_to_T():
479            for i, read in enumerate(self):
480                read.seq = read.seq.back_transcribe()
481                yield read
482        # Return #
483        return self._generator_mod(all_U_to_T, new_path, in_place)
484
485    #---------------------------- Third party programs -----------------------#
486    def align(self, out_path=None):
487        """We align the sequences in the fasta file with muscle."""
488        if out_path is None: out_path = self.prefix_path + '.aln'
489        sh.muscle38("-in", self.path, "-out", out_path)
490        from fasta.aligned import AlignedFASTA
491        return AlignedFASTA(out_path)
492
493    def template_align(self, ref_path):
494        """
495        We align the sequences in the fasta file with mothur and a template.
496        """
497        # Run it #
498        msg = "#align.seqs(candidate=%s, template=%s, search=blast," \
499              "flip=false, processors=8);"
500        sh.mothur(msg % (self.path, ref_path))
501        # Move things #
502        shutil.move(self.path[:-6] + '.align',        self.p.aligned)
503        shutil.move(self.path[:-6] + '.align.report', self.p.report)
504        shutil.move(self.path[:-6] + '.flip.accnos',  self.p.accnos)
505        # Clean up #
506        if os.path.exists('formatdb.log'):
507            os.remove('formatdb.log')
508        if os.path.exists('error.log') and os.path.getsize('error.log') == 0:
509            os.remove('error.log')
510        for path in sh.glob('mothur.*.logfile'):
511            os.remove(path)
512        # Return #
513        return self.p.aligned
514
515    def index_bowtie(self):
516        """Create an index on the fasta file compatible with bowtie2."""
517        # It returns exit code 1 if the fasta is empty #
518        assert self
519        # Call the bowtie executable #
520        sh.bowtie2_build(self.path, self.path)
521        return FilePath(self.path + '.1.bt2')
522
523    def index_samtools(self):
524        """Create an index on the fasta file compatible with samtools."""
525        sh.samtools('faidx', self.path)
526        return FilePath(self.path + '.fai')
527
528    #--------------------------------- Graphs --------------------------------#
529    @property_cached
530    def graphs(self):
531        """
532        Sorry for the black magic. The result is an object whose attributes
533        are all the graphs found in `./graphs.py` initialized with this
534        instance as only argument.
535        """
536        # Make a dummy object #
537        result = Dummy()
538        # Loop over graphs #
539        for graph in graphs.__all__:
540            cls = getattr(graphs, graph)
541            setattr(result, cls.short_name, cls(self))
542        # Return #
543        return result
544
545    #-------------------------------- Primers -------------------------------#
546    def parse_primers(self, primers, mismatches=None):
547        """
548        Takes care of identifying primers inside every sequence.
549        Instead of yielding Seq objects now we yield ReadWithPrimers objects.
550        These have extra properties that show the start and end positions
551        of all primers found.
552        """
553        # Default is zero #
554        if mismatches is None: mismatches = 0
555        # Get the search expressions with mismatches #
556        from fasta.primers import PrimersRegexes
557        regexes = PrimersRegexes(primers, mismatches)
558        # Generate a new special object for every read #
559        from fasta.primers import ReadWithPrimers
560        read_with_primer = lambda read: ReadWithPrimers(read, regexes)
561        generator = (read_with_primer(r) for r in self.parse())
562        # Add the length to the generator #
563        from plumbing.common import GenWithLength
564        generator = GenWithLength(generator, len(self))
565        # Return #
566        return generator

class Dummy: View Source

32class Dummy: pass

fasta.core

Inherited Members