fasta.fastq
Written by Lucas Sinclair. MIT Licensed. Contact at www.sinclair.bio
1#!/usr/bin/env python3 2# -*- coding: utf-8 -*- 3 4""" 5Written by Lucas Sinclair. 6MIT Licensed. 7Contact at www.sinclair.bio 8""" 9 10# Built-in modules # 11import os, sys, shutil, platform 12 13# Internal modules # 14from fasta import FASTA 15 16# First party modules # 17from plumbing.common import average 18from plumbing.cache import property_cached 19from autopaths.tmp_path import new_temp_path 20from autopaths.file_path import FilePath 21 22# Third party modules # 23if platform.system() == 'Windows': import pbs3 as sh 24else: import sh 25 26################################################################################ 27class FASTQ(FASTA): 28 """ 29 A single FASTQ file somewhere in the filesystem. 30 It is very similar to a FASTA file object, except we have a few added 31 methods and properties for dealing with the quality scores. 32 """ 33 34 ext = 'fastq' 35 format = 'fastq' 36 37 #----------------------------- Properties --------------------------------# 38 @property_cached 39 def count(self): 40 # Import module # 41 from shell_command import shell_output 42 # Case when we are not compressed # 43 if not self.gzipped: 44 return int(int(shell_output("cat %s | wc -l" % self.path)) / 4) 45 # If we are gzipped we can just use zcat or gzcat on macOS # 46 program = 'gzcat' if sys.platform != 'linux' else 'zcat' 47 command = "%s %s | wc -l" % (program, self.path) 48 return int(int(shell_output(command)) / 4) 49 50 @property_cached 51 def avg_quality(self): 52 """ 53 Returns a single float. 54 Quite slow and inefficient computationally. 55 Think of a better implementation? 56 """ 57 mean = average(s for r in self for s in 58 r.letter_annotations["phred_quality"]) 59 return mean 60 61 #----------------------------- Conversion --------------------------------# 62 def to_fasta(self, path, verbose=False): 63 from Bio import SeqIO 64 # Select verbosity # 65 import tqdm 66 wrapper = tqdm.tqdm if verbose else lambda x: x 67 # Do it # 68 with open(path, 'w') as handle: 69 for r in wrapper(self): SeqIO.write(r, handle, 'fasta') 70 # Return # 71 return FASTA(path) 72 73 def to_qual(self, path, verbose=False): 74 from Bio import SeqIO 75 # Select verbosity # 76 import tqdm 77 wrapper = tqdm.tqdm if verbose else lambda x: x 78 # Do it # 79 with open(path, 'w') as handle: 80 for r in wrapper(self): SeqIO.write(r, handle, 'qual') 81 # Return # 82 return FilePath(path) 83 84 #-------------------------------- Tools ----------------------------------# 85 @property_cached 86 def validator(self): 87 """Validate the format of this FASTQ.""" 88 from fasta.validator import Validator 89 return Validator(self.path) 90 91 @property_cached 92 def fastqc(self): 93 """Run the FastQC software on this FASTQ.""" 94 from fasta.fastqc import FastQC 95 return FastQC(self) 96 97 #-------------------------------- PHRED Format ---------------------------# 98 def guess_phred_format(self): 99 """ 100 Guess the PHRED score format. The gold standard is the first one, aka 101 the one called "Sanger". Sanger encoding is exactly equivalent to 102 Illumina-1.8 encoding. 103 In other words, they finally gave up with their alternative standards. 104 """ 105 # Possibilities # 106 self.intervals = { 107 'Sanger': (33, 74), # <- This one is the gold standard 108 'Solexa': (59, 105), # <- This one is rare 109 'Illumina-1.3': (64, 105), # <- These were abandoned after they wised up. 110 'Illumina-1.5': (67, 105), # <- These were abandoned after they wised up. 111 } 112 # Initialize variables # 113 glob_min = 9999 114 glob_max = 0 115 take_next = False 116 # Stop after # 117 max_sequences_to_consider = 2000000 118 # Error message # 119 msg = "Multiple PHRED encodings possible for '%s'.\n" 120 # Loop # 121 self.open() 122 for i, line in enumerate(self.handle): 123 # Are we on the PHRED line? # 124 if not take_next: 125 take_next = line.startswith('+') 126 continue 127 # Now we are on the PHRED line! # 128 take_next = False 129 current_min, current_max = self.get_qual_range(line.rstrip('\n')) 130 if current_min < glob_min or current_max > glob_max: 131 glob_min = min(current_min, glob_min) 132 glob_max = max(current_max, glob_max) 133 valid_encodings = [e for e, r in self.intervals.items() 134 if glob_min >= r[0] and glob_max <= r[1]] 135 if len(valid_encodings) == 0: 136 msg = "Illegal PHRED encoding for '%s'.\n" 137 break 138 if len(valid_encodings) == 1: 139 return valid_encodings[0] 140 # Stop condition # 141 if i >= max_sequences_to_consider: break 142 # It didn't work # 143 msg += "Maximum detected value is: %s.\n" 144 msg += "Minimum detected value is: %s.\n" 145 msg += "Possibilities are: %s." 146 msg = msg % (self, glob_max, glob_min, ' or '.join(valid_encodings)) 147 raise Exception(msg) 148 149 def get_qual_range(self, phred_string): 150 """ 151 >>> self.get_qual_range("DLXYXXRXWYYTPMLUUQWTXTRSXSWMDMTRNDNSMJFJFFRMV") 152 (68, 89) 153 """ 154 values = [ord(char) for char in phred_string] 155 return min(values), max(values) 156 157 #------------------------- Conversion methods ----------------------------# 158 def phred_13_to_18(self, new_path=None, in_place=True): 159 """Illumina-1.3 format conversion to Illumina-1.8 format via BioPython.""" 160 # New file # 161 if new_path is None: 162 new_fastq = self.__class__(new_temp_path(suffix=self.extension)) 163 else: 164 new_fastq = self.__class__(new_path) 165 # Do it # 166 self.format = 'fastq-illumina' 167 new_fastq.open('w') 168 new_fastq.handle.writelines(seq.format('fastq-sanger') for seq in self) 169 new_fastq.close() 170 self.format = 'fastq-sanger' 171 # Return # 172 if in_place: 173 os.remove(self.path) 174 shutil.move(new_fastq, self.path) 175 return self 176 else: return new_fastq 177 178 def phred_13_to_18_sed(self, new_path=None, in_place=True): 179 """Illumina-1.3 format conversion to Illumina-1.8 format via sed (faster).""" 180 # String # 181 sed_command = r"""4~4y/@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghi/!"#$%&'\''()*+,-.\/0123456789:;<=>?@ABCDEFGHIJ/""" 182 # Faster with bash utilities # 183 if in_place is True: 184 sh.sed('-i', sed_command, self.path) 185 return self 186 # New file # 187 if new_path is None: new_fastq = self.__class__(new_temp_path()) 188 else: new_fastq = self.__class__(new_path) 189 sh.sed(sed_command + " " + new_fastq, self.path) 190 return new_fastq
28class FASTQ(FASTA): 29 """ 30 A single FASTQ file somewhere in the filesystem. 31 It is very similar to a FASTA file object, except we have a few added 32 methods and properties for dealing with the quality scores. 33 """ 34 35 ext = 'fastq' 36 format = 'fastq' 37 38 #----------------------------- Properties --------------------------------# 39 @property_cached 40 def count(self): 41 # Import module # 42 from shell_command import shell_output 43 # Case when we are not compressed # 44 if not self.gzipped: 45 return int(int(shell_output("cat %s | wc -l" % self.path)) / 4) 46 # If we are gzipped we can just use zcat or gzcat on macOS # 47 program = 'gzcat' if sys.platform != 'linux' else 'zcat' 48 command = "%s %s | wc -l" % (program, self.path) 49 return int(int(shell_output(command)) / 4) 50 51 @property_cached 52 def avg_quality(self): 53 """ 54 Returns a single float. 55 Quite slow and inefficient computationally. 56 Think of a better implementation? 57 """ 58 mean = average(s for r in self for s in 59 r.letter_annotations["phred_quality"]) 60 return mean 61 62 #----------------------------- Conversion --------------------------------# 63 def to_fasta(self, path, verbose=False): 64 from Bio import SeqIO 65 # Select verbosity # 66 import tqdm 67 wrapper = tqdm.tqdm if verbose else lambda x: x 68 # Do it # 69 with open(path, 'w') as handle: 70 for r in wrapper(self): SeqIO.write(r, handle, 'fasta') 71 # Return # 72 return FASTA(path) 73 74 def to_qual(self, path, verbose=False): 75 from Bio import SeqIO 76 # Select verbosity # 77 import tqdm 78 wrapper = tqdm.tqdm if verbose else lambda x: x 79 # Do it # 80 with open(path, 'w') as handle: 81 for r in wrapper(self): SeqIO.write(r, handle, 'qual') 82 # Return # 83 return FilePath(path) 84 85 #-------------------------------- Tools ----------------------------------# 86 @property_cached 87 def validator(self): 88 """Validate the format of this FASTQ.""" 89 from fasta.validator import Validator 90 return Validator(self.path) 91 92 @property_cached 93 def fastqc(self): 94 """Run the FastQC software on this FASTQ.""" 95 from fasta.fastqc import FastQC 96 return FastQC(self) 97 98 #-------------------------------- PHRED Format ---------------------------# 99 def guess_phred_format(self): 100 """ 101 Guess the PHRED score format. The gold standard is the first one, aka 102 the one called "Sanger". Sanger encoding is exactly equivalent to 103 Illumina-1.8 encoding. 104 In other words, they finally gave up with their alternative standards. 105 """ 106 # Possibilities # 107 self.intervals = { 108 'Sanger': (33, 74), # <- This one is the gold standard 109 'Solexa': (59, 105), # <- This one is rare 110 'Illumina-1.3': (64, 105), # <- These were abandoned after they wised up. 111 'Illumina-1.5': (67, 105), # <- These were abandoned after they wised up. 112 } 113 # Initialize variables # 114 glob_min = 9999 115 glob_max = 0 116 take_next = False 117 # Stop after # 118 max_sequences_to_consider = 2000000 119 # Error message # 120 msg = "Multiple PHRED encodings possible for '%s'.\n" 121 # Loop # 122 self.open() 123 for i, line in enumerate(self.handle): 124 # Are we on the PHRED line? # 125 if not take_next: 126 take_next = line.startswith('+') 127 continue 128 # Now we are on the PHRED line! # 129 take_next = False 130 current_min, current_max = self.get_qual_range(line.rstrip('\n')) 131 if current_min < glob_min or current_max > glob_max: 132 glob_min = min(current_min, glob_min) 133 glob_max = max(current_max, glob_max) 134 valid_encodings = [e for e, r in self.intervals.items() 135 if glob_min >= r[0] and glob_max <= r[1]] 136 if len(valid_encodings) == 0: 137 msg = "Illegal PHRED encoding for '%s'.\n" 138 break 139 if len(valid_encodings) == 1: 140 return valid_encodings[0] 141 # Stop condition # 142 if i >= max_sequences_to_consider: break 143 # It didn't work # 144 msg += "Maximum detected value is: %s.\n" 145 msg += "Minimum detected value is: %s.\n" 146 msg += "Possibilities are: %s." 147 msg = msg % (self, glob_max, glob_min, ' or '.join(valid_encodings)) 148 raise Exception(msg) 149 150 def get_qual_range(self, phred_string): 151 """ 152 >>> self.get_qual_range("DLXYXXRXWYYTPMLUUQWTXTRSXSWMDMTRNDNSMJFJFFRMV") 153 (68, 89) 154 """ 155 values = [ord(char) for char in phred_string] 156 return min(values), max(values) 157 158 #------------------------- Conversion methods ----------------------------# 159 def phred_13_to_18(self, new_path=None, in_place=True): 160 """Illumina-1.3 format conversion to Illumina-1.8 format via BioPython.""" 161 # New file # 162 if new_path is None: 163 new_fastq = self.__class__(new_temp_path(suffix=self.extension)) 164 else: 165 new_fastq = self.__class__(new_path) 166 # Do it # 167 self.format = 'fastq-illumina' 168 new_fastq.open('w') 169 new_fastq.handle.writelines(seq.format('fastq-sanger') for seq in self) 170 new_fastq.close() 171 self.format = 'fastq-sanger' 172 # Return # 173 if in_place: 174 os.remove(self.path) 175 shutil.move(new_fastq, self.path) 176 return self 177 else: return new_fastq 178 179 def phred_13_to_18_sed(self, new_path=None, in_place=True): 180 """Illumina-1.3 format conversion to Illumina-1.8 format via sed (faster).""" 181 # String # 182 sed_command = r"""4~4y/@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghi/!"#$%&'\''()*+,-.\/0123456789:;<=>?@ABCDEFGHIJ/""" 183 # Faster with bash utilities # 184 if in_place is True: 185 sh.sed('-i', sed_command, self.path) 186 return self 187 # New file # 188 if new_path is None: new_fastq = self.__class__(new_temp_path()) 189 else: new_fastq = self.__class__(new_path) 190 sh.sed(sed_command + " " + new_fastq, self.path) 191 return new_fastq
A single FASTQ file somewhere in the filesystem. It is very similar to a FASTA file object, except we have a few added methods and properties for dealing with the quality scores.
avg_quality
Returns a single float. Quite slow and inefficient computationally. Think of a better implementation?
def
to_fasta(self, path, verbose=False):
63 def to_fasta(self, path, verbose=False): 64 from Bio import SeqIO 65 # Select verbosity # 66 import tqdm 67 wrapper = tqdm.tqdm if verbose else lambda x: x 68 # Do it # 69 with open(path, 'w') as handle: 70 for r in wrapper(self): SeqIO.write(r, handle, 'fasta') 71 # Return # 72 return FASTA(path)
def
to_qual(self, path, verbose=False):
74 def to_qual(self, path, verbose=False): 75 from Bio import SeqIO 76 # Select verbosity # 77 import tqdm 78 wrapper = tqdm.tqdm if verbose else lambda x: x 79 # Do it # 80 with open(path, 'w') as handle: 81 for r in wrapper(self): SeqIO.write(r, handle, 'qual') 82 # Return # 83 return FilePath(path)
def
guess_phred_format(self):
99 def guess_phred_format(self): 100 """ 101 Guess the PHRED score format. The gold standard is the first one, aka 102 the one called "Sanger". Sanger encoding is exactly equivalent to 103 Illumina-1.8 encoding. 104 In other words, they finally gave up with their alternative standards. 105 """ 106 # Possibilities # 107 self.intervals = { 108 'Sanger': (33, 74), # <- This one is the gold standard 109 'Solexa': (59, 105), # <- This one is rare 110 'Illumina-1.3': (64, 105), # <- These were abandoned after they wised up. 111 'Illumina-1.5': (67, 105), # <- These were abandoned after they wised up. 112 } 113 # Initialize variables # 114 glob_min = 9999 115 glob_max = 0 116 take_next = False 117 # Stop after # 118 max_sequences_to_consider = 2000000 119 # Error message # 120 msg = "Multiple PHRED encodings possible for '%s'.\n" 121 # Loop # 122 self.open() 123 for i, line in enumerate(self.handle): 124 # Are we on the PHRED line? # 125 if not take_next: 126 take_next = line.startswith('+') 127 continue 128 # Now we are on the PHRED line! # 129 take_next = False 130 current_min, current_max = self.get_qual_range(line.rstrip('\n')) 131 if current_min < glob_min or current_max > glob_max: 132 glob_min = min(current_min, glob_min) 133 glob_max = max(current_max, glob_max) 134 valid_encodings = [e for e, r in self.intervals.items() 135 if glob_min >= r[0] and glob_max <= r[1]] 136 if len(valid_encodings) == 0: 137 msg = "Illegal PHRED encoding for '%s'.\n" 138 break 139 if len(valid_encodings) == 1: 140 return valid_encodings[0] 141 # Stop condition # 142 if i >= max_sequences_to_consider: break 143 # It didn't work # 144 msg += "Maximum detected value is: %s.\n" 145 msg += "Minimum detected value is: %s.\n" 146 msg += "Possibilities are: %s." 147 msg = msg % (self, glob_max, glob_min, ' or '.join(valid_encodings)) 148 raise Exception(msg)
Guess the PHRED score format. The gold standard is the first one, aka the one called "Sanger". Sanger encoding is exactly equivalent to Illumina-1.8 encoding. In other words, they finally gave up with their alternative standards.
def
get_qual_range(self, phred_string):
150 def get_qual_range(self, phred_string): 151 """ 152 >>> self.get_qual_range("DLXYXXRXWYYTPMLUUQWTXTRSXSWMDMTRNDNSMJFJFFRMV") 153 (68, 89) 154 """ 155 values = [ord(char) for char in phred_string] 156 return min(values), max(values)
>>> self.get_qual_range("DLXYXXRXWYYTPMLUUQWTXTRSXSWMDMTRNDNSMJFJFFRMV")
(68, 89)
def
phred_13_to_18(self, new_path=None, in_place=True):
159 def phred_13_to_18(self, new_path=None, in_place=True): 160 """Illumina-1.3 format conversion to Illumina-1.8 format via BioPython.""" 161 # New file # 162 if new_path is None: 163 new_fastq = self.__class__(new_temp_path(suffix=self.extension)) 164 else: 165 new_fastq = self.__class__(new_path) 166 # Do it # 167 self.format = 'fastq-illumina' 168 new_fastq.open('w') 169 new_fastq.handle.writelines(seq.format('fastq-sanger') for seq in self) 170 new_fastq.close() 171 self.format = 'fastq-sanger' 172 # Return # 173 if in_place: 174 os.remove(self.path) 175 shutil.move(new_fastq, self.path) 176 return self 177 else: return new_fastq
Illumina-1.3 format conversion to Illumina-1.8 format via BioPython.
def
phred_13_to_18_sed(self, new_path=None, in_place=True):
179 def phred_13_to_18_sed(self, new_path=None, in_place=True): 180 """Illumina-1.3 format conversion to Illumina-1.8 format via sed (faster).""" 181 # String # 182 sed_command = r"""4~4y/@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghi/!"#$%&'\''()*+,-.\/0123456789:;<=>?@ABCDEFGHIJ/""" 183 # Faster with bash utilities # 184 if in_place is True: 185 sh.sed('-i', sed_command, self.path) 186 return self 187 # New file # 188 if new_path is None: new_fastq = self.__class__(new_temp_path()) 189 else: new_fastq = self.__class__(new_path) 190 sh.sed(sed_command + " " + new_fastq, self.path) 191 return new_fastq
Illumina-1.3 format conversion to Illumina-1.8 format via sed (faster).
Inherited Members
- autopaths.base_path.BasePath
- BasePath
- clean_path
- path
- short_prefix
- extension
- escaped
- absolute_path
- physical_path
- relative_path
- rel_path_from
- exists
- is_symlink
- permissions
- mdate
- mdate_iso
- cdate
- cdate_iso
- unix_style
- wsl_style
- win_style
- with_tilda
- with_home
- link_from
- link_to
- symlinks_on_linux
- symlinks_on_windows
- hard_link_win_to
- fasta.core.FASTA
- buffer_size
- gzipped
- first
- lengths
- lengths_counter
- open
- close
- parse
- progress
- create
- add
- add_seq
- add_str
- add_fasta
- add_fastas
- flush
- write
- compress
- compress_slow
- compress_fast
- ids
- get_id
- sequences
- sql
- length_by_id
- subsample
- rename_with_num
- rename_with_prefix
- rename_sequences
- extract_length
- extract_sequences
- remove_trailing_stars
- remove_duplicates
- convert_U_to_T
- align
- template_align
- index_bowtie
- index_samtools
- graphs
- parse_primers
- autopaths.file_path.FilePath
- prefix_path
- prefix
- name
- filename
- directory
- count_bytes
- size
- contents
- contents_utf8
- md5
- might_be_binary
- contains_binary
- magic_number
- lines
- read
- touch
- writelines
- remove
- copy
- execute
- replace_extension
- new_name_insert
- make_directory
- must_exist
- head
- pretty_head
- tail
- pretty_tail
- move_to
- rename
- gzip_to
- gzip_internal
- gzip_external
- gzip_pigz
- ungzip_to
- ungzip_internal
- ungzip_external
- zip_to
- unzip_to
- untar_to
- untargz_to
- untargz_to_internal
- untargz_to_external
- append
- prepend
- remove_line
- remove_first_line
- replace_line
- replace_word
- sed_replace
- builtins.str
- encode
- replace
- split
- rsplit
- join
- capitalize
- casefold
- title
- center
- expandtabs
- find
- partition
- index
- ljust
- lower
- lstrip
- rfind
- rindex
- rjust
- rstrip
- rpartition
- splitlines
- strip
- swapcase
- translate
- upper
- startswith
- endswith
- removeprefix
- removesuffix
- isascii
- islower
- isupper
- istitle
- isspace
- isdecimal
- isdigit
- isnumeric
- isalpha
- isalnum
- isidentifier
- isprintable
- zfill
- format_map
- maketrans