fasta.fastq

Written by Lucas Sinclair. MIT Licensed. Contact at www.sinclair.bio

View Source

  1#!/usr/bin/env python3
  2# -*- coding: utf-8 -*-
  3
  4"""
  5Written by Lucas Sinclair.
  6MIT Licensed.
  7Contact at www.sinclair.bio
  8"""
  9
 10# Built-in modules #
 11import os, sys, shutil, platform
 12
 13# Internal modules #
 14from fasta import FASTA
 15
 16# First party modules #
 17from plumbing.common     import average
 18from plumbing.cache      import property_cached
 19from autopaths.tmp_path  import new_temp_path
 20from autopaths.file_path import FilePath
 21
 22# Third party modules #
 23if platform.system() == 'Windows': import pbs3 as sh
 24else: import sh
 25
 26################################################################################
 27class FASTQ(FASTA):
 28    """
 29    A single FASTQ file somewhere in the filesystem.
 30    It is very similar to a FASTA file object, except we have a few added
 31    methods and properties for dealing with the quality scores.
 32    """
 33
 34    ext    = 'fastq'
 35    format = 'fastq'
 36
 37    #----------------------------- Properties --------------------------------#
 38    @property_cached
 39    def count(self):
 40        # Import module #
 41        from shell_command import shell_output
 42        # Case when we are not compressed #
 43        if not self.gzipped:
 44            return int(int(shell_output("cat %s | wc -l" % self.path)) / 4)
 45        # If we are gzipped we can just use zcat or gzcat on macOS #
 46        program = 'gzcat' if sys.platform != 'linux' else 'zcat'
 47        command = "%s %s | wc -l" % (program, self.path)
 48        return int(int(shell_output(command)) / 4)
 49
 50    @property_cached
 51    def avg_quality(self):
 52        """
 53        Returns a single float.
 54        Quite slow and inefficient computationally.
 55        Think of a better implementation?
 56        """
 57        mean = average(s for r in self for s in
 58                       r.letter_annotations["phred_quality"])
 59        return mean
 60
 61    #----------------------------- Conversion --------------------------------#
 62    def to_fasta(self, path, verbose=False):
 63        from Bio import SeqIO
 64        # Select verbosity #
 65        import tqdm
 66        wrapper = tqdm.tqdm if verbose else lambda x: x
 67        # Do it #
 68        with open(path, 'w') as handle:
 69            for r in wrapper(self): SeqIO.write(r, handle, 'fasta')
 70        # Return #
 71        return FASTA(path)
 72
 73    def to_qual(self, path, verbose=False):
 74        from Bio import SeqIO
 75        # Select verbosity #
 76        import tqdm
 77        wrapper = tqdm.tqdm if verbose else lambda x: x
 78        # Do it #
 79        with open(path, 'w') as handle:
 80            for r in wrapper(self): SeqIO.write(r, handle, 'qual')
 81        # Return #
 82        return FilePath(path)
 83
 84    #-------------------------------- Tools ----------------------------------#
 85    @property_cached
 86    def validator(self):
 87        """Validate the format of this FASTQ."""
 88        from fasta.validator import Validator
 89        return Validator(self.path)
 90
 91    @property_cached
 92    def fastqc(self):
 93        """Run the FastQC software on this FASTQ."""
 94        from fasta.fastqc import FastQC
 95        return FastQC(self)
 96
 97    #-------------------------------- PHRED Format ---------------------------#
 98    def guess_phred_format(self):
 99        """
100        Guess the PHRED score format. The gold standard is the first one, aka
101        the one called "Sanger". Sanger encoding is exactly equivalent to
102        Illumina-1.8 encoding.
103        In other words, they finally gave up with their alternative standards.
104        """
105        # Possibilities #
106        self.intervals = {
107            'Sanger':       (33,  74),    # <- This one is the gold standard
108            'Solexa':       (59, 105),    # <- This one is rare
109            'Illumina-1.3': (64, 105),    # <- These were abandoned after they wised up.
110            'Illumina-1.5': (67, 105),    # <- These were abandoned after they wised up.
111        }
112        # Initialize variables #
113        glob_min  = 9999
114        glob_max  = 0
115        take_next = False
116        # Stop after #
117        max_sequences_to_consider = 2000000
118        # Error message #
119        msg = "Multiple PHRED encodings possible for '%s'.\n"
120        # Loop #
121        self.open()
122        for i, line in enumerate(self.handle):
123            # Are we on the PHRED line? #
124            if not take_next:
125                take_next = line.startswith('+')
126                continue
127            # Now we are on the PHRED line! #
128            take_next = False
129            current_min, current_max = self.get_qual_range(line.rstrip('\n'))
130            if current_min < glob_min or current_max > glob_max:
131                glob_min = min(current_min, glob_min)
132                glob_max = max(current_max, glob_max)
133                valid_encodings = [e for e, r in self.intervals.items()
134                                   if glob_min >= r[0] and glob_max <= r[1]]
135                if len(valid_encodings) == 0:
136                    msg = "Illegal PHRED encoding for '%s'.\n"
137                    break
138                if len(valid_encodings) == 1:
139                    return valid_encodings[0]
140            # Stop condition #
141            if i >= max_sequences_to_consider: break
142        # It didn't work #
143        msg += "Maximum detected value is: %s.\n"
144        msg += "Minimum detected value is: %s.\n"
145        msg += "Possibilities are: %s."
146        msg  = msg % (self, glob_max, glob_min, ' or '.join(valid_encodings))
147        raise Exception(msg)
148
149    def get_qual_range(self, phred_string):
150        """
151        >>> self.get_qual_range("DLXYXXRXWYYTPMLUUQWTXTRSXSWMDMTRNDNSMJFJFFRMV")
152        (68, 89)
153        """
154        values = [ord(char) for char in phred_string]
155        return min(values), max(values)
156
157    #------------------------- Conversion methods ----------------------------#
158    def phred_13_to_18(self, new_path=None, in_place=True):
159        """Illumina-1.3 format conversion to Illumina-1.8 format via BioPython."""
160        # New file #
161        if new_path is None:
162            new_fastq = self.__class__(new_temp_path(suffix=self.extension))
163        else:
164            new_fastq = self.__class__(new_path)
165        # Do it #
166        self.format = 'fastq-illumina'
167        new_fastq.open('w')
168        new_fastq.handle.writelines(seq.format('fastq-sanger') for seq in self)
169        new_fastq.close()
170        self.format = 'fastq-sanger'
171        # Return #
172        if in_place:
173            os.remove(self.path)
174            shutil.move(new_fastq, self.path)
175            return self
176        else: return new_fastq
177
178    def phred_13_to_18_sed(self, new_path=None, in_place=True):
179        """Illumina-1.3 format conversion to Illumina-1.8 format via sed (faster)."""
180        # String #
181        sed_command = r"""4~4y/@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghi/!"#$%&'\''()*+,-.\/0123456789:;<=>?@ABCDEFGHIJ/"""
182        # Faster with bash utilities #
183        if in_place is True:
184            sh.sed('-i', sed_command, self.path)
185            return self
186        # New file #
187        if new_path is None: new_fastq = self.__class__(new_temp_path())
188        else:                new_fastq = self.__class__(new_path)
189        sh.sed(sed_command + " " + new_fastq, self.path)
190        return new_fastq

class FASTQ(fasta.core.FASTA): View Source

 28class FASTQ(FASTA):
 29    """
 30    A single FASTQ file somewhere in the filesystem.
 31    It is very similar to a FASTA file object, except we have a few added
 32    methods and properties for dealing with the quality scores.
 33    """
 34
 35    ext    = 'fastq'
 36    format = 'fastq'
 37
 38    #----------------------------- Properties --------------------------------#
 39    @property_cached
 40    def count(self):
 41        # Import module #
 42        from shell_command import shell_output
 43        # Case when we are not compressed #
 44        if not self.gzipped:
 45            return int(int(shell_output("cat %s | wc -l" % self.path)) / 4)
 46        # If we are gzipped we can just use zcat or gzcat on macOS #
 47        program = 'gzcat' if sys.platform != 'linux' else 'zcat'
 48        command = "%s %s | wc -l" % (program, self.path)
 49        return int(int(shell_output(command)) / 4)
 50
 51    @property_cached
 52    def avg_quality(self):
 53        """
 54        Returns a single float.
 55        Quite slow and inefficient computationally.
 56        Think of a better implementation?
 57        """
 58        mean = average(s for r in self for s in
 59                       r.letter_annotations["phred_quality"])
 60        return mean
 61
 62    #----------------------------- Conversion --------------------------------#
 63    def to_fasta(self, path, verbose=False):
 64        from Bio import SeqIO
 65        # Select verbosity #
 66        import tqdm
 67        wrapper = tqdm.tqdm if verbose else lambda x: x
 68        # Do it #
 69        with open(path, 'w') as handle:
 70            for r in wrapper(self): SeqIO.write(r, handle, 'fasta')
 71        # Return #
 72        return FASTA(path)
 73
 74    def to_qual(self, path, verbose=False):
 75        from Bio import SeqIO
 76        # Select verbosity #
 77        import tqdm
 78        wrapper = tqdm.tqdm if verbose else lambda x: x
 79        # Do it #
 80        with open(path, 'w') as handle:
 81            for r in wrapper(self): SeqIO.write(r, handle, 'qual')
 82        # Return #
 83        return FilePath(path)
 84
 85    #-------------------------------- Tools ----------------------------------#
 86    @property_cached
 87    def validator(self):
 88        """Validate the format of this FASTQ."""
 89        from fasta.validator import Validator
 90        return Validator(self.path)
 91
 92    @property_cached
 93    def fastqc(self):
 94        """Run the FastQC software on this FASTQ."""
 95        from fasta.fastqc import FastQC
 96        return FastQC(self)
 97
 98    #-------------------------------- PHRED Format ---------------------------#
 99    def guess_phred_format(self):
100        """
101        Guess the PHRED score format. The gold standard is the first one, aka
102        the one called "Sanger". Sanger encoding is exactly equivalent to
103        Illumina-1.8 encoding.
104        In other words, they finally gave up with their alternative standards.
105        """
106        # Possibilities #
107        self.intervals = {
108            'Sanger':       (33,  74),    # <- This one is the gold standard
109            'Solexa':       (59, 105),    # <- This one is rare
110            'Illumina-1.3': (64, 105),    # <- These were abandoned after they wised up.
111            'Illumina-1.5': (67, 105),    # <- These were abandoned after they wised up.
112        }
113        # Initialize variables #
114        glob_min  = 9999
115        glob_max  = 0
116        take_next = False
117        # Stop after #
118        max_sequences_to_consider = 2000000
119        # Error message #
120        msg = "Multiple PHRED encodings possible for '%s'.\n"
121        # Loop #
122        self.open()
123        for i, line in enumerate(self.handle):
124            # Are we on the PHRED line? #
125            if not take_next:
126                take_next = line.startswith('+')
127                continue
128            # Now we are on the PHRED line! #
129            take_next = False
130            current_min, current_max = self.get_qual_range(line.rstrip('\n'))
131            if current_min < glob_min or current_max > glob_max:
132                glob_min = min(current_min, glob_min)
133                glob_max = max(current_max, glob_max)
134                valid_encodings = [e for e, r in self.intervals.items()
135                                   if glob_min >= r[0] and glob_max <= r[1]]
136                if len(valid_encodings) == 0:
137                    msg = "Illegal PHRED encoding for '%s'.\n"
138                    break
139                if len(valid_encodings) == 1:
140                    return valid_encodings[0]
141            # Stop condition #
142            if i >= max_sequences_to_consider: break
143        # It didn't work #
144        msg += "Maximum detected value is: %s.\n"
145        msg += "Minimum detected value is: %s.\n"
146        msg += "Possibilities are: %s."
147        msg  = msg % (self, glob_max, glob_min, ' or '.join(valid_encodings))
148        raise Exception(msg)
149
150    def get_qual_range(self, phred_string):
151        """
152        >>> self.get_qual_range("DLXYXXRXWYYTPMLUUQWTXTRSXSWMDMTRNDNSMJFJFFRMV")
153        (68, 89)
154        """
155        values = [ord(char) for char in phred_string]
156        return min(values), max(values)
157
158    #------------------------- Conversion methods ----------------------------#
159    def phred_13_to_18(self, new_path=None, in_place=True):
160        """Illumina-1.3 format conversion to Illumina-1.8 format via BioPython."""
161        # New file #
162        if new_path is None:
163            new_fastq = self.__class__(new_temp_path(suffix=self.extension))
164        else:
165            new_fastq = self.__class__(new_path)
166        # Do it #
167        self.format = 'fastq-illumina'
168        new_fastq.open('w')
169        new_fastq.handle.writelines(seq.format('fastq-sanger') for seq in self)
170        new_fastq.close()
171        self.format = 'fastq-sanger'
172        # Return #
173        if in_place:
174            os.remove(self.path)
175            shutil.move(new_fastq, self.path)
176            return self
177        else: return new_fastq
178
179    def phred_13_to_18_sed(self, new_path=None, in_place=True):
180        """Illumina-1.3 format conversion to Illumina-1.8 format via sed (faster)."""
181        # String #
182        sed_command = r"""4~4y/@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghi/!"#$%&'\''()*+,-.\/0123456789:;<=>?@ABCDEFGHIJ/"""
183        # Faster with bash utilities #
184        if in_place is True:
185            sh.sed('-i', sed_command, self.path)
186            return self
187        # New file #
188        if new_path is None: new_fastq = self.__class__(new_temp_path())
189        else:                new_fastq = self.__class__(new_path)
190        sh.sed(sed_command + " " + new_fastq, self.path)
191        return new_fastq

A single FASTQ file somewhere in the filesystem. It is very similar to a FASTA file object, except we have a few added methods and properties for dealing with the quality scores.

ext = 'fastq'

format = 'fastq'

count

Should probably check for file size changes instead of just caching once TODO.

avg_quality

Returns a single float. Quite slow and inefficient computationally. Think of a better implementation?

def to_fasta(self, path, verbose=False): View Source

63    def to_fasta(self, path, verbose=False):
64        from Bio import SeqIO
65        # Select verbosity #
66        import tqdm
67        wrapper = tqdm.tqdm if verbose else lambda x: x
68        # Do it #
69        with open(path, 'w') as handle:
70            for r in wrapper(self): SeqIO.write(r, handle, 'fasta')
71        # Return #
72        return FASTA(path)

def to_qual(self, path, verbose=False): View Source

74    def to_qual(self, path, verbose=False):
75        from Bio import SeqIO
76        # Select verbosity #
77        import tqdm
78        wrapper = tqdm.tqdm if verbose else lambda x: x
79        # Do it #
80        with open(path, 'w') as handle:
81            for r in wrapper(self): SeqIO.write(r, handle, 'qual')
82        # Return #
83        return FilePath(path)

validator

Validate the format of this FASTQ.

fastqc

Run the FastQC software on this FASTQ.

def guess_phred_format(self): View Source

 99    def guess_phred_format(self):
100        """
101        Guess the PHRED score format. The gold standard is the first one, aka
102        the one called "Sanger". Sanger encoding is exactly equivalent to
103        Illumina-1.8 encoding.
104        In other words, they finally gave up with their alternative standards.
105        """
106        # Possibilities #
107        self.intervals = {
108            'Sanger':       (33,  74),    # <- This one is the gold standard
109            'Solexa':       (59, 105),    # <- This one is rare
110            'Illumina-1.3': (64, 105),    # <- These were abandoned after they wised up.
111            'Illumina-1.5': (67, 105),    # <- These were abandoned after they wised up.
112        }
113        # Initialize variables #
114        glob_min  = 9999
115        glob_max  = 0
116        take_next = False
117        # Stop after #
118        max_sequences_to_consider = 2000000
119        # Error message #
120        msg = "Multiple PHRED encodings possible for '%s'.\n"
121        # Loop #
122        self.open()
123        for i, line in enumerate(self.handle):
124            # Are we on the PHRED line? #
125            if not take_next:
126                take_next = line.startswith('+')
127                continue
128            # Now we are on the PHRED line! #
129            take_next = False
130            current_min, current_max = self.get_qual_range(line.rstrip('\n'))
131            if current_min < glob_min or current_max > glob_max:
132                glob_min = min(current_min, glob_min)
133                glob_max = max(current_max, glob_max)
134                valid_encodings = [e for e, r in self.intervals.items()
135                                   if glob_min >= r[0] and glob_max <= r[1]]
136                if len(valid_encodings) == 0:
137                    msg = "Illegal PHRED encoding for '%s'.\n"
138                    break
139                if len(valid_encodings) == 1:
140                    return valid_encodings[0]
141            # Stop condition #
142            if i >= max_sequences_to_consider: break
143        # It didn't work #
144        msg += "Maximum detected value is: %s.\n"
145        msg += "Minimum detected value is: %s.\n"
146        msg += "Possibilities are: %s."
147        msg  = msg % (self, glob_max, glob_min, ' or '.join(valid_encodings))
148        raise Exception(msg)

Guess the PHRED score format. The gold standard is the first one, aka the one called "Sanger". Sanger encoding is exactly equivalent to Illumina-1.8 encoding. In other words, they finally gave up with their alternative standards.

def get_qual_range(self, phred_string): View Source

150    def get_qual_range(self, phred_string):
151        """
152        >>> self.get_qual_range("DLXYXXRXWYYTPMLUUQWTXTRSXSWMDMTRNDNSMJFJFFRMV")
153        (68, 89)
154        """
155        values = [ord(char) for char in phred_string]
156        return min(values), max(values)

>>> self.get_qual_range("DLXYXXRXWYYTPMLUUQWTXTRSXSWMDMTRNDNSMJFJFFRMV")
(68, 89)

def phred_13_to_18(self, new_path=None, in_place=True): View Source

159    def phred_13_to_18(self, new_path=None, in_place=True):
160        """Illumina-1.3 format conversion to Illumina-1.8 format via BioPython."""
161        # New file #
162        if new_path is None:
163            new_fastq = self.__class__(new_temp_path(suffix=self.extension))
164        else:
165            new_fastq = self.__class__(new_path)
166        # Do it #
167        self.format = 'fastq-illumina'
168        new_fastq.open('w')
169        new_fastq.handle.writelines(seq.format('fastq-sanger') for seq in self)
170        new_fastq.close()
171        self.format = 'fastq-sanger'
172        # Return #
173        if in_place:
174            os.remove(self.path)
175            shutil.move(new_fastq, self.path)
176            return self
177        else: return new_fastq

Illumina-1.3 format conversion to Illumina-1.8 format via BioPython.

def phred_13_to_18_sed(self, new_path=None, in_place=True): View Source

179    def phred_13_to_18_sed(self, new_path=None, in_place=True):
180        """Illumina-1.3 format conversion to Illumina-1.8 format via sed (faster)."""
181        # String #
182        sed_command = r"""4~4y/@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghi/!"#$%&'\''()*+,-.\/0123456789:;<=>?@ABCDEFGHIJ/"""
183        # Faster with bash utilities #
184        if in_place is True:
185            sh.sed('-i', sed_command, self.path)
186            return self
187        # New file #
188        if new_path is None: new_fastq = self.__class__(new_temp_path())
189        else:                new_fastq = self.__class__(new_path)
190        sh.sed(sed_command + " " + new_fastq, self.path)
191        return new_fastq

Illumina-1.3 format conversion to Illumina-1.8 format via sed (faster).

Inherited Members

autopaths.base_path.BasePath: BasePath; clean_path; path; short_prefix; extension; escaped; absolute_path; physical_path; relative_path; rel_path_from; exists; is_symlink; permissions; mdate; mdate_iso; cdate; cdate_iso; unix_style; wsl_style; win_style; with_tilda; with_home; link_from; link_to; symlinks_on_linux; symlinks_on_windows; hard_link_win_to
fasta.core.FASTA: buffer_size; gzipped; first; lengths; lengths_counter; open; close; parse; progress; create; add; add_seq; add_str; add_fasta; add_fastas; flush; write; compress; compress_slow; compress_fast; ids; get_id; sequences; sql; length_by_id; subsample; rename_with_num; rename_with_prefix; rename_sequences; extract_length; extract_sequences; remove_trailing_stars; remove_duplicates; convert_U_to_T; align; template_align; index_bowtie; index_samtools; graphs; parse_primers
autopaths.file_path.FilePath: prefix_path; prefix; name; filename; directory; count_bytes; size; contents; contents_utf8; md5; might_be_binary; contains_binary; magic_number; lines; read; touch; writelines; remove; copy; execute; replace_extension; new_name_insert; make_directory; must_exist; head; pretty_head; tail; pretty_tail; move_to; rename; gzip_to; gzip_internal; gzip_external; gzip_pigz; ungzip_to; ungzip_internal; ungzip_external; zip_to; unzip_to; untar_to; untargz_to; untargz_to_internal; untargz_to_external; append; prepend; remove_line; remove_first_line; replace_line; replace_word; sed_replace
builtins.str: encode; replace; split; rsplit; join; capitalize; casefold; title; center; expandtabs; find; partition; index; ljust; lower; lstrip; rfind; rindex; rjust; rstrip; rpartition; splitlines; strip; swapcase; translate; upper; startswith; endswith; removeprefix; removesuffix; isascii; islower; isupper; istitle; isspace; isdecimal; isdigit; isnumeric; isalpha; isalnum; isidentifier; isprintable; zfill; format_map; maketrans