fasta.fastq

Written by Lucas Sinclair. MIT Licensed. Contact at www.sinclair.bio

  1#!/usr/bin/env python3
  2# -*- coding: utf-8 -*-
  3
  4"""
  5Written by Lucas Sinclair.
  6MIT Licensed.
  7Contact at www.sinclair.bio
  8"""
  9
 10# Built-in modules #
 11import os, sys, shutil, platform
 12
 13# Internal modules #
 14from fasta import FASTA
 15
 16# First party modules #
 17from plumbing.common     import average
 18from plumbing.cache      import property_cached
 19from autopaths.tmp_path  import new_temp_path
 20from autopaths.file_path import FilePath
 21
 22# Third party modules #
 23if platform.system() == 'Windows': import pbs3 as sh
 24else: import sh
 25
 26################################################################################
 27class FASTQ(FASTA):
 28    """
 29    A single FASTQ file somewhere in the filesystem.
 30    It is very similar to a FASTA file object, except we have a few added
 31    methods and properties for dealing with the quality scores.
 32    """
 33
 34    ext    = 'fastq'
 35    format = 'fastq'
 36
 37    #----------------------------- Properties --------------------------------#
 38    @property_cached
 39    def count(self):
 40        # Import module #
 41        from shell_command import shell_output
 42        # Case when we are not compressed #
 43        if not self.gzipped:
 44            return int(int(shell_output("cat %s | wc -l" % self.path)) / 4)
 45        # If we are gzipped we can just use zcat or gzcat on macOS #
 46        program = 'gzcat' if sys.platform != 'linux' else 'zcat'
 47        command = "%s %s | wc -l" % (program, self.path)
 48        return int(int(shell_output(command)) / 4)
 49
 50    @property_cached
 51    def avg_quality(self):
 52        """
 53        Returns a single float.
 54        Quite slow and inefficient computationally.
 55        Think of a better implementation?
 56        """
 57        mean = average(s for r in self for s in
 58                       r.letter_annotations["phred_quality"])
 59        return mean
 60
 61    #----------------------------- Conversion --------------------------------#
 62    def to_fasta(self, path, verbose=False):
 63        from Bio import SeqIO
 64        # Select verbosity #
 65        import tqdm
 66        wrapper = tqdm.tqdm if verbose else lambda x: x
 67        # Do it #
 68        with open(path, 'w') as handle:
 69            for r in wrapper(self): SeqIO.write(r, handle, 'fasta')
 70        # Return #
 71        return FASTA(path)
 72
 73    def to_qual(self, path, verbose=False):
 74        from Bio import SeqIO
 75        # Select verbosity #
 76        import tqdm
 77        wrapper = tqdm.tqdm if verbose else lambda x: x
 78        # Do it #
 79        with open(path, 'w') as handle:
 80            for r in wrapper(self): SeqIO.write(r, handle, 'qual')
 81        # Return #
 82        return FilePath(path)
 83
 84    #-------------------------------- Tools ----------------------------------#
 85    @property_cached
 86    def validator(self):
 87        """Validate the format of this FASTQ."""
 88        from fasta.validator import Validator
 89        return Validator(self.path)
 90
 91    @property_cached
 92    def fastqc(self):
 93        """Run the FastQC software on this FASTQ."""
 94        from fasta.fastqc import FastQC
 95        return FastQC(self)
 96
 97    #-------------------------------- PHRED Format ---------------------------#
 98    def guess_phred_format(self):
 99        """
100        Guess the PHRED score format. The gold standard is the first one, aka
101        the one called "Sanger". Sanger encoding is exactly equivalent to
102        Illumina-1.8 encoding.
103        In other words, they finally gave up with their alternative standards.
104        """
105        # Possibilities #
106        self.intervals = {
107            'Sanger':       (33,  74),    # <- This one is the gold standard
108            'Solexa':       (59, 105),    # <- This one is rare
109            'Illumina-1.3': (64, 105),    # <- These were abandoned after they wised up.
110            'Illumina-1.5': (67, 105),    # <- These were abandoned after they wised up.
111        }
112        # Initialize variables #
113        glob_min  = 9999
114        glob_max  = 0
115        take_next = False
116        # Stop after #
117        max_sequences_to_consider = 2000000
118        # Error message #
119        msg = "Multiple PHRED encodings possible for '%s'.\n"
120        # Loop #
121        self.open()
122        for i, line in enumerate(self.handle):
123            # Are we on the PHRED line? #
124            if not take_next:
125                take_next = line.startswith('+')
126                continue
127            # Now we are on the PHRED line! #
128            take_next = False
129            current_min, current_max = self.get_qual_range(line.rstrip('\n'))
130            if current_min < glob_min or current_max > glob_max:
131                glob_min = min(current_min, glob_min)
132                glob_max = max(current_max, glob_max)
133                valid_encodings = [e for e, r in self.intervals.items()
134                                   if glob_min >= r[0] and glob_max <= r[1]]
135                if len(valid_encodings) == 0:
136                    msg = "Illegal PHRED encoding for '%s'.\n"
137                    break
138                if len(valid_encodings) == 1:
139                    return valid_encodings[0]
140            # Stop condition #
141            if i >= max_sequences_to_consider: break
142        # It didn't work #
143        msg += "Maximum detected value is: %s.\n"
144        msg += "Minimum detected value is: %s.\n"
145        msg += "Possibilities are: %s."
146        msg  = msg % (self, glob_max, glob_min, ' or '.join(valid_encodings))
147        raise Exception(msg)
148
149    def get_qual_range(self, phred_string):
150        """
151        >>> self.get_qual_range("DLXYXXRXWYYTPMLUUQWTXTRSXSWMDMTRNDNSMJFJFFRMV")
152        (68, 89)
153        """
154        values = [ord(char) for char in phred_string]
155        return min(values), max(values)
156
157    #------------------------- Conversion methods ----------------------------#
158    def phred_13_to_18(self, new_path=None, in_place=True):
159        """Illumina-1.3 format conversion to Illumina-1.8 format via BioPython."""
160        # New file #
161        if new_path is None:
162            new_fastq = self.__class__(new_temp_path(suffix=self.extension))
163        else:
164            new_fastq = self.__class__(new_path)
165        # Do it #
166        self.format = 'fastq-illumina'
167        new_fastq.open('w')
168        new_fastq.handle.writelines(seq.format('fastq-sanger') for seq in self)
169        new_fastq.close()
170        self.format = 'fastq-sanger'
171        # Return #
172        if in_place:
173            os.remove(self.path)
174            shutil.move(new_fastq, self.path)
175            return self
176        else: return new_fastq
177
178    def phred_13_to_18_sed(self, new_path=None, in_place=True):
179        """Illumina-1.3 format conversion to Illumina-1.8 format via sed (faster)."""
180        # String #
181        sed_command = r"""4~4y/@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghi/!"#$%&'\''()*+,-.\/0123456789:;<=>?@ABCDEFGHIJ/"""
182        # Faster with bash utilities #
183        if in_place is True:
184            sh.sed('-i', sed_command, self.path)
185            return self
186        # New file #
187        if new_path is None: new_fastq = self.__class__(new_temp_path())
188        else:                new_fastq = self.__class__(new_path)
189        sh.sed(sed_command + " " + new_fastq, self.path)
190        return new_fastq
class FASTQ(fasta.core.FASTA):
 28class FASTQ(FASTA):
 29    """
 30    A single FASTQ file somewhere in the filesystem.
 31    It is very similar to a FASTA file object, except we have a few added
 32    methods and properties for dealing with the quality scores.
 33    """
 34
 35    ext    = 'fastq'
 36    format = 'fastq'
 37
 38    #----------------------------- Properties --------------------------------#
 39    @property_cached
 40    def count(self):
 41        # Import module #
 42        from shell_command import shell_output
 43        # Case when we are not compressed #
 44        if not self.gzipped:
 45            return int(int(shell_output("cat %s | wc -l" % self.path)) / 4)
 46        # If we are gzipped we can just use zcat or gzcat on macOS #
 47        program = 'gzcat' if sys.platform != 'linux' else 'zcat'
 48        command = "%s %s | wc -l" % (program, self.path)
 49        return int(int(shell_output(command)) / 4)
 50
 51    @property_cached
 52    def avg_quality(self):
 53        """
 54        Returns a single float.
 55        Quite slow and inefficient computationally.
 56        Think of a better implementation?
 57        """
 58        mean = average(s for r in self for s in
 59                       r.letter_annotations["phred_quality"])
 60        return mean
 61
 62    #----------------------------- Conversion --------------------------------#
 63    def to_fasta(self, path, verbose=False):
 64        from Bio import SeqIO
 65        # Select verbosity #
 66        import tqdm
 67        wrapper = tqdm.tqdm if verbose else lambda x: x
 68        # Do it #
 69        with open(path, 'w') as handle:
 70            for r in wrapper(self): SeqIO.write(r, handle, 'fasta')
 71        # Return #
 72        return FASTA(path)
 73
 74    def to_qual(self, path, verbose=False):
 75        from Bio import SeqIO
 76        # Select verbosity #
 77        import tqdm
 78        wrapper = tqdm.tqdm if verbose else lambda x: x
 79        # Do it #
 80        with open(path, 'w') as handle:
 81            for r in wrapper(self): SeqIO.write(r, handle, 'qual')
 82        # Return #
 83        return FilePath(path)
 84
 85    #-------------------------------- Tools ----------------------------------#
 86    @property_cached
 87    def validator(self):
 88        """Validate the format of this FASTQ."""
 89        from fasta.validator import Validator
 90        return Validator(self.path)
 91
 92    @property_cached
 93    def fastqc(self):
 94        """Run the FastQC software on this FASTQ."""
 95        from fasta.fastqc import FastQC
 96        return FastQC(self)
 97
 98    #-------------------------------- PHRED Format ---------------------------#
 99    def guess_phred_format(self):
100        """
101        Guess the PHRED score format. The gold standard is the first one, aka
102        the one called "Sanger". Sanger encoding is exactly equivalent to
103        Illumina-1.8 encoding.
104        In other words, they finally gave up with their alternative standards.
105        """
106        # Possibilities #
107        self.intervals = {
108            'Sanger':       (33,  74),    # <- This one is the gold standard
109            'Solexa':       (59, 105),    # <- This one is rare
110            'Illumina-1.3': (64, 105),    # <- These were abandoned after they wised up.
111            'Illumina-1.5': (67, 105),    # <- These were abandoned after they wised up.
112        }
113        # Initialize variables #
114        glob_min  = 9999
115        glob_max  = 0
116        take_next = False
117        # Stop after #
118        max_sequences_to_consider = 2000000
119        # Error message #
120        msg = "Multiple PHRED encodings possible for '%s'.\n"
121        # Loop #
122        self.open()
123        for i, line in enumerate(self.handle):
124            # Are we on the PHRED line? #
125            if not take_next:
126                take_next = line.startswith('+')
127                continue
128            # Now we are on the PHRED line! #
129            take_next = False
130            current_min, current_max = self.get_qual_range(line.rstrip('\n'))
131            if current_min < glob_min or current_max > glob_max:
132                glob_min = min(current_min, glob_min)
133                glob_max = max(current_max, glob_max)
134                valid_encodings = [e for e, r in self.intervals.items()
135                                   if glob_min >= r[0] and glob_max <= r[1]]
136                if len(valid_encodings) == 0:
137                    msg = "Illegal PHRED encoding for '%s'.\n"
138                    break
139                if len(valid_encodings) == 1:
140                    return valid_encodings[0]
141            # Stop condition #
142            if i >= max_sequences_to_consider: break
143        # It didn't work #
144        msg += "Maximum detected value is: %s.\n"
145        msg += "Minimum detected value is: %s.\n"
146        msg += "Possibilities are: %s."
147        msg  = msg % (self, glob_max, glob_min, ' or '.join(valid_encodings))
148        raise Exception(msg)
149
150    def get_qual_range(self, phred_string):
151        """
152        >>> self.get_qual_range("DLXYXXRXWYYTPMLUUQWTXTRSXSWMDMTRNDNSMJFJFFRMV")
153        (68, 89)
154        """
155        values = [ord(char) for char in phred_string]
156        return min(values), max(values)
157
158    #------------------------- Conversion methods ----------------------------#
159    def phred_13_to_18(self, new_path=None, in_place=True):
160        """Illumina-1.3 format conversion to Illumina-1.8 format via BioPython."""
161        # New file #
162        if new_path is None:
163            new_fastq = self.__class__(new_temp_path(suffix=self.extension))
164        else:
165            new_fastq = self.__class__(new_path)
166        # Do it #
167        self.format = 'fastq-illumina'
168        new_fastq.open('w')
169        new_fastq.handle.writelines(seq.format('fastq-sanger') for seq in self)
170        new_fastq.close()
171        self.format = 'fastq-sanger'
172        # Return #
173        if in_place:
174            os.remove(self.path)
175            shutil.move(new_fastq, self.path)
176            return self
177        else: return new_fastq
178
179    def phred_13_to_18_sed(self, new_path=None, in_place=True):
180        """Illumina-1.3 format conversion to Illumina-1.8 format via sed (faster)."""
181        # String #
182        sed_command = r"""4~4y/@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghi/!"#$%&'\''()*+,-.\/0123456789:;<=>?@ABCDEFGHIJ/"""
183        # Faster with bash utilities #
184        if in_place is True:
185            sh.sed('-i', sed_command, self.path)
186            return self
187        # New file #
188        if new_path is None: new_fastq = self.__class__(new_temp_path())
189        else:                new_fastq = self.__class__(new_path)
190        sh.sed(sed_command + " " + new_fastq, self.path)
191        return new_fastq

A single FASTQ file somewhere in the filesystem. It is very similar to a FASTA file object, except we have a few added methods and properties for dealing with the quality scores.

ext = 'fastq'
format = 'fastq'
count

Should probably check for file size changes instead of just caching once TODO.

avg_quality

Returns a single float. Quite slow and inefficient computationally. Think of a better implementation?

def to_fasta(self, path, verbose=False):
63    def to_fasta(self, path, verbose=False):
64        from Bio import SeqIO
65        # Select verbosity #
66        import tqdm
67        wrapper = tqdm.tqdm if verbose else lambda x: x
68        # Do it #
69        with open(path, 'w') as handle:
70            for r in wrapper(self): SeqIO.write(r, handle, 'fasta')
71        # Return #
72        return FASTA(path)
def to_qual(self, path, verbose=False):
74    def to_qual(self, path, verbose=False):
75        from Bio import SeqIO
76        # Select verbosity #
77        import tqdm
78        wrapper = tqdm.tqdm if verbose else lambda x: x
79        # Do it #
80        with open(path, 'w') as handle:
81            for r in wrapper(self): SeqIO.write(r, handle, 'qual')
82        # Return #
83        return FilePath(path)
validator

Validate the format of this FASTQ.

fastqc

Run the FastQC software on this FASTQ.

def guess_phred_format(self):
 99    def guess_phred_format(self):
100        """
101        Guess the PHRED score format. The gold standard is the first one, aka
102        the one called "Sanger". Sanger encoding is exactly equivalent to
103        Illumina-1.8 encoding.
104        In other words, they finally gave up with their alternative standards.
105        """
106        # Possibilities #
107        self.intervals = {
108            'Sanger':       (33,  74),    # <- This one is the gold standard
109            'Solexa':       (59, 105),    # <- This one is rare
110            'Illumina-1.3': (64, 105),    # <- These were abandoned after they wised up.
111            'Illumina-1.5': (67, 105),    # <- These were abandoned after they wised up.
112        }
113        # Initialize variables #
114        glob_min  = 9999
115        glob_max  = 0
116        take_next = False
117        # Stop after #
118        max_sequences_to_consider = 2000000
119        # Error message #
120        msg = "Multiple PHRED encodings possible for '%s'.\n"
121        # Loop #
122        self.open()
123        for i, line in enumerate(self.handle):
124            # Are we on the PHRED line? #
125            if not take_next:
126                take_next = line.startswith('+')
127                continue
128            # Now we are on the PHRED line! #
129            take_next = False
130            current_min, current_max = self.get_qual_range(line.rstrip('\n'))
131            if current_min < glob_min or current_max > glob_max:
132                glob_min = min(current_min, glob_min)
133                glob_max = max(current_max, glob_max)
134                valid_encodings = [e for e, r in self.intervals.items()
135                                   if glob_min >= r[0] and glob_max <= r[1]]
136                if len(valid_encodings) == 0:
137                    msg = "Illegal PHRED encoding for '%s'.\n"
138                    break
139                if len(valid_encodings) == 1:
140                    return valid_encodings[0]
141            # Stop condition #
142            if i >= max_sequences_to_consider: break
143        # It didn't work #
144        msg += "Maximum detected value is: %s.\n"
145        msg += "Minimum detected value is: %s.\n"
146        msg += "Possibilities are: %s."
147        msg  = msg % (self, glob_max, glob_min, ' or '.join(valid_encodings))
148        raise Exception(msg)

Guess the PHRED score format. The gold standard is the first one, aka the one called "Sanger". Sanger encoding is exactly equivalent to Illumina-1.8 encoding. In other words, they finally gave up with their alternative standards.

def get_qual_range(self, phred_string):
150    def get_qual_range(self, phred_string):
151        """
152        >>> self.get_qual_range("DLXYXXRXWYYTPMLUUQWTXTRSXSWMDMTRNDNSMJFJFFRMV")
153        (68, 89)
154        """
155        values = [ord(char) for char in phred_string]
156        return min(values), max(values)
>>> self.get_qual_range("DLXYXXRXWYYTPMLUUQWTXTRSXSWMDMTRNDNSMJFJFFRMV")
(68, 89)
def phred_13_to_18(self, new_path=None, in_place=True):
159    def phred_13_to_18(self, new_path=None, in_place=True):
160        """Illumina-1.3 format conversion to Illumina-1.8 format via BioPython."""
161        # New file #
162        if new_path is None:
163            new_fastq = self.__class__(new_temp_path(suffix=self.extension))
164        else:
165            new_fastq = self.__class__(new_path)
166        # Do it #
167        self.format = 'fastq-illumina'
168        new_fastq.open('w')
169        new_fastq.handle.writelines(seq.format('fastq-sanger') for seq in self)
170        new_fastq.close()
171        self.format = 'fastq-sanger'
172        # Return #
173        if in_place:
174            os.remove(self.path)
175            shutil.move(new_fastq, self.path)
176            return self
177        else: return new_fastq

Illumina-1.3 format conversion to Illumina-1.8 format via BioPython.

def phred_13_to_18_sed(self, new_path=None, in_place=True):
179    def phred_13_to_18_sed(self, new_path=None, in_place=True):
180        """Illumina-1.3 format conversion to Illumina-1.8 format via sed (faster)."""
181        # String #
182        sed_command = r"""4~4y/@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghi/!"#$%&'\''()*+,-.\/0123456789:;<=>?@ABCDEFGHIJ/"""
183        # Faster with bash utilities #
184        if in_place is True:
185            sh.sed('-i', sed_command, self.path)
186            return self
187        # New file #
188        if new_path is None: new_fastq = self.__class__(new_temp_path())
189        else:                new_fastq = self.__class__(new_path)
190        sh.sed(sed_command + " " + new_fastq, self.path)
191        return new_fastq

Illumina-1.3 format conversion to Illumina-1.8 format via sed (faster).

Inherited Members
autopaths.base_path.BasePath
BasePath
clean_path
path
short_prefix
extension
escaped
absolute_path
physical_path
relative_path
rel_path_from
exists
permissions
mdate
mdate_iso
cdate
cdate_iso
unix_style
wsl_style
win_style
with_tilda
with_home
fasta.core.FASTA
buffer_size
gzipped
first
lengths
lengths_counter
open
close
parse
progress
create
add
add_seq
add_str
add_fasta
add_fastas
flush
write
compress
compress_slow
compress_fast
ids
get_id
sequences
sql
length_by_id
subsample
rename_with_num
rename_with_prefix
rename_sequences
extract_length
extract_sequences
remove_trailing_stars
remove_duplicates
convert_U_to_T
align
template_align
index_bowtie
index_samtools
graphs
parse_primers
autopaths.file_path.FilePath
prefix_path
prefix
name
filename
directory
count_bytes
size
contents
contents_utf8
md5
might_be_binary
contains_binary
magic_number
lines
read
touch
writelines
remove
copy
execute
replace_extension
new_name_insert
make_directory
must_exist
head
pretty_head
tail
pretty_tail
move_to
rename
gzip_to
gzip_internal
gzip_external
gzip_pigz
ungzip_to
ungzip_internal
ungzip_external
zip_to
unzip_to
untar_to
untargz_to
untargz_to_internal
untargz_to_external
append
prepend
remove_line
remove_first_line
replace_line
replace_word
sed_replace
builtins.str
encode
replace
split
rsplit
join
capitalize
casefold
title
center
expandtabs
find
partition
index
ljust
lower
lstrip
rfind
rindex
rjust
rstrip
rpartition
splitlines
strip
swapcase
translate
upper
startswith
endswith
removeprefix
removesuffix
isascii
islower
isupper
istitle
isspace
isdecimal
isdigit
isnumeric
isalpha
isalnum
isidentifier
isprintable
zfill
format_map
maketrans