seqsearch.search.vsearch
Written by Lucas Sinclair. MIT Licensed. Contact at www.sinclair.bio
1#!/usr/bin/env python3 2# -*- coding: utf-8 -*- 3 4""" 5Written by Lucas Sinclair. 6MIT Licensed. 7Contact at www.sinclair.bio 8""" 9 10# Built-in modules # 11 12# Internal modules # 13from autopaths.file_path import FilePath 14from seqsearch.search.core import CoreSearch 15 16# Third party modules # 17from seqsearch import sh 18 19############################################################################### 20class VSEARCHquery(CoreSearch): 21 """ 22 A vsearch job. 23 24 Example commands: 25 26 vsearch --usearch_global queries.fas --db queries.fas --id 0.6 27 --blast6out results.blast6 --maxaccepts 0 --maxrejects 0 28 29 ./vsearch --usearch_global queries.fsa --db database.fsa --id 0.9 30 --alnout alnout.txt 31 32 Some interesting options form the manual follow: 33 34 --maxaccepts <positive integer> 35 > Maximum number of hits to accept before stopping the search. The default 36 value is 1. This option works in pair with --maxrejects. The search process 37 sorts target sequences by decreasing number of k-mers they have in common 38 with the query sequence, using that information as a proxy for sequence 39 similarity. After pairwise alignments, if the first target sequence passes 40 the acceptation criteria, it is accepted as best hit and the search process 41 stops for that query. If --maxaccepts is set to a higher value, more hits 42 are accepted. 43 """ 44 45 extension = 'vsearchout' 46 47 @property 48 def command(self): 49 # Executable # 50 if self.executable: cmd = [self.executable.path] 51 else: cmd = ['vsearch'] 52 # Other parameters 53 cmd += ['--usearch_global', self.query, 54 '--db', self.db, 55 '--blast6out', self.out_path, 56 '--threads', self.cpus] 57 # Options # 58 for k,v in self.params.items(): cmd += [k, v] 59 # Return # 60 return list(map(str, cmd)) 61 62 #-------------------------------- RUNNING --------------------------------# 63 def run(self, verbose=False): 64 """Simply run the VSEARCH search locally.""" 65 # Check the executable is available # 66 if self.executable: 67 self.executable.must_exist() 68 else: 69 from plumbing.check_cmd_found import check_cmd 70 check_cmd('vsearch') 71 # Create the output directory if it doesn't exist # 72 self.out_path.directory.create_if_not_exists() 73 # Optionally print the command # 74 if verbose: 75 print("Running VSEARCH command:\n %s" % ' '.join(self.command)) 76 # Run it # 77 cmd = sh.Command(self.command[0]) 78 result = cmd(self.command[1:], _out=self._out, _err=self._err) 79 # Return # 80 return result 81 82 #----------------------------- PARSE RESULTS -----------------------------# 83 @property 84 def results(self): 85 """ 86 Parse the results and yield biopython SearchIO entries. 87 88 Beware: 89 Some databases are not unique on the id, and this causes the parser to 90 complain about duplicate entries and raise exceptions such as: 91 92 ValueError: The ID or alternative IDs of Hit 'DQ448783' exists 93 in this QueryResult. 94 95 Summary of the columns: 96 https://www.metagenomics.wiki/tools/blast/blastn-output-format-6 97 98 qseqid sseqid pident length mismatch gapopen qstart qend sstart send 99 evalue bitscore 100 101 Warning: Unlike BLAST results, if a sequence got no hits it is NOT 102 reported at all in VSEARCH. The number of entries yielded 103 will not match the number of sequences at input. 104 """ 105 # Import parsing library # 106 from Bio import SearchIO 107 # Avoid the warning # 108 import warnings 109 warnings.filterwarnings("ignore", 'BiopythonDeprecationWarning') 110 # Iterate over lines # 111 with open(self.out_path, 'rt') as handle: 112 for entry in SearchIO.parse(handle, 'blast-tab'): 113 yield entry 114 115############################################################################### 116class VSEARCHdb(FilePath): 117 """A VSEARCH database one can search against.""" 118 119 def __repr__(self): 120 return '<%s on "%s">' % (self.__class__.__name__, self.path) 121 122 def __init__(self, path): 123 # Call parent constructor # 124 FilePath.__init__(self, path) 125 # Check if the corresponding FASTA exists # 126 self.fasta_path = self.replace_extension('fasta') 127 128 def __bool__(self): 129 """Does the indexed database actually exist?""" 130 return bool(self.replace_extension('udb')) 131 132 def create_if_not_exists(self, *args, **kwargs): 133 """If the indexed database has not been generated, generate it.""" 134 if not self: return self.makedb(*args, **kwargs) 135 136 def makedb(self, output=None, stdout=None, verbose=False): 137 # We need a corresponding fasta that exists # 138 if not self.fasta_path: 139 raise Exception("No fasta file at '%s'." % self.fasta_path) 140 # Message # 141 if verbose: 142 msg = "Calling `makeudb_usearch` from '%s' to '%s'." 143 print(msg % (self.fasta_path, self)) 144 # The output # 145 if output is None: output = self 146 # Options # 147 options = ['--makeudb_usearch', self.fasta_path, '--output', output] 148 # Call the program # 149 sh.vsearch(*options, _out=stdout) 150 # Return # 151 return output
21class VSEARCHquery(CoreSearch): 22 """ 23 A vsearch job. 24 25 Example commands: 26 27 vsearch --usearch_global queries.fas --db queries.fas --id 0.6 28 --blast6out results.blast6 --maxaccepts 0 --maxrejects 0 29 30 ./vsearch --usearch_global queries.fsa --db database.fsa --id 0.9 31 --alnout alnout.txt 32 33 Some interesting options form the manual follow: 34 35 --maxaccepts <positive integer> 36 > Maximum number of hits to accept before stopping the search. The default 37 value is 1. This option works in pair with --maxrejects. The search process 38 sorts target sequences by decreasing number of k-mers they have in common 39 with the query sequence, using that information as a proxy for sequence 40 similarity. After pairwise alignments, if the first target sequence passes 41 the acceptation criteria, it is accepted as best hit and the search process 42 stops for that query. If --maxaccepts is set to a higher value, more hits 43 are accepted. 44 """ 45 46 extension = 'vsearchout' 47 48 @property 49 def command(self): 50 # Executable # 51 if self.executable: cmd = [self.executable.path] 52 else: cmd = ['vsearch'] 53 # Other parameters 54 cmd += ['--usearch_global', self.query, 55 '--db', self.db, 56 '--blast6out', self.out_path, 57 '--threads', self.cpus] 58 # Options # 59 for k,v in self.params.items(): cmd += [k, v] 60 # Return # 61 return list(map(str, cmd)) 62 63 #-------------------------------- RUNNING --------------------------------# 64 def run(self, verbose=False): 65 """Simply run the VSEARCH search locally.""" 66 # Check the executable is available # 67 if self.executable: 68 self.executable.must_exist() 69 else: 70 from plumbing.check_cmd_found import check_cmd 71 check_cmd('vsearch') 72 # Create the output directory if it doesn't exist # 73 self.out_path.directory.create_if_not_exists() 74 # Optionally print the command # 75 if verbose: 76 print("Running VSEARCH command:\n %s" % ' '.join(self.command)) 77 # Run it # 78 cmd = sh.Command(self.command[0]) 79 result = cmd(self.command[1:], _out=self._out, _err=self._err) 80 # Return # 81 return result 82 83 #----------------------------- PARSE RESULTS -----------------------------# 84 @property 85 def results(self): 86 """ 87 Parse the results and yield biopython SearchIO entries. 88 89 Beware: 90 Some databases are not unique on the id, and this causes the parser to 91 complain about duplicate entries and raise exceptions such as: 92 93 ValueError: The ID or alternative IDs of Hit 'DQ448783' exists 94 in this QueryResult. 95 96 Summary of the columns: 97 https://www.metagenomics.wiki/tools/blast/blastn-output-format-6 98 99 qseqid sseqid pident length mismatch gapopen qstart qend sstart send 100 evalue bitscore 101 102 Warning: Unlike BLAST results, if a sequence got no hits it is NOT 103 reported at all in VSEARCH. The number of entries yielded 104 will not match the number of sequences at input. 105 """ 106 # Import parsing library # 107 from Bio import SearchIO 108 # Avoid the warning # 109 import warnings 110 warnings.filterwarnings("ignore", 'BiopythonDeprecationWarning') 111 # Iterate over lines # 112 with open(self.out_path, 'rt') as handle: 113 for entry in SearchIO.parse(handle, 'blast-tab'): 114 yield entry
A vsearch job.
Example commands:
vsearch --usearch_global queries.fas --db queries.fas --id 0.6
--blast6out results.blast6 --maxaccepts 0 --maxrejects 0
./vsearch --usearch_global queries.fsa --db database.fsa --id 0.9
--alnout alnout.txt
Some interesting options form the manual follow:
--maxaccepts
Maximum number of hits to accept before stopping the search. The default value is 1. This option works in pair with --maxrejects. The search process sorts target sequences by decreasing number of k-mers they have in common with the query sequence, using that information as a proxy for sequence similarity. After pairwise alignments, if the first target sequence passes the acceptation criteria, it is accepted as best hit and the search process stops for that query. If --maxaccepts is set to a higher value, more hits are accepted.
64 def run(self, verbose=False): 65 """Simply run the VSEARCH search locally.""" 66 # Check the executable is available # 67 if self.executable: 68 self.executable.must_exist() 69 else: 70 from plumbing.check_cmd_found import check_cmd 71 check_cmd('vsearch') 72 # Create the output directory if it doesn't exist # 73 self.out_path.directory.create_if_not_exists() 74 # Optionally print the command # 75 if verbose: 76 print("Running VSEARCH command:\n %s" % ' '.join(self.command)) 77 # Run it # 78 cmd = sh.Command(self.command[0]) 79 result = cmd(self.command[1:], _out=self._out, _err=self._err) 80 # Return # 81 return result
Simply run the VSEARCH search locally.
Parse the results and yield biopython SearchIO entries.
Beware: Some databases are not unique on the id, and this causes the parser to complain about duplicate entries and raise exceptions such as:
ValueError: The ID or alternative IDs of Hit 'DQ448783' exists
in this QueryResult.
Summary of the columns: https://www.metagenomics.wiki/tools/blast/blastn-output-format-6
qseqid sseqid pident length mismatch gapopen qstart qend sstart send
evalue bitscore
Warning: Unlike BLAST results, if a sequence got no hits it is NOT reported at all in VSEARCH. The number of entries yielded will not match the number of sequences at input.
Inherited Members
117class VSEARCHdb(FilePath): 118 """A VSEARCH database one can search against.""" 119 120 def __repr__(self): 121 return '<%s on "%s">' % (self.__class__.__name__, self.path) 122 123 def __init__(self, path): 124 # Call parent constructor # 125 FilePath.__init__(self, path) 126 # Check if the corresponding FASTA exists # 127 self.fasta_path = self.replace_extension('fasta') 128 129 def __bool__(self): 130 """Does the indexed database actually exist?""" 131 return bool(self.replace_extension('udb')) 132 133 def create_if_not_exists(self, *args, **kwargs): 134 """If the indexed database has not been generated, generate it.""" 135 if not self: return self.makedb(*args, **kwargs) 136 137 def makedb(self, output=None, stdout=None, verbose=False): 138 # We need a corresponding fasta that exists # 139 if not self.fasta_path: 140 raise Exception("No fasta file at '%s'." % self.fasta_path) 141 # Message # 142 if verbose: 143 msg = "Calling `makeudb_usearch` from '%s' to '%s'." 144 print(msg % (self.fasta_path, self)) 145 # The output # 146 if output is None: output = self 147 # Options # 148 options = ['--makeudb_usearch', self.fasta_path, '--output', output] 149 # Call the program # 150 sh.vsearch(*options, _out=stdout) 151 # Return # 152 return output
A VSEARCH database one can search against.
59 def __new__(cls, path, *args, **kwargs): 60 """A Path object is in fact a string.""" 61 return str.__new__(cls, cls.clean_path(path))
A Path object is in fact a string.
133 def create_if_not_exists(self, *args, **kwargs): 134 """If the indexed database has not been generated, generate it.""" 135 if not self: return self.makedb(*args, **kwargs)
If the indexed database has not been generated, generate it.
137 def makedb(self, output=None, stdout=None, verbose=False): 138 # We need a corresponding fasta that exists # 139 if not self.fasta_path: 140 raise Exception("No fasta file at '%s'." % self.fasta_path) 141 # Message # 142 if verbose: 143 msg = "Calling `makeudb_usearch` from '%s' to '%s'." 144 print(msg % (self.fasta_path, self)) 145 # The output # 146 if output is None: output = self 147 # Options # 148 options = ['--makeudb_usearch', self.fasta_path, '--output', output] 149 # Call the program # 150 sh.vsearch(*options, _out=stdout) 151 # Return # 152 return output
Inherited Members
- autopaths.file_path.FilePath
- first
- prefix_path
- prefix
- name
- filename
- directory
- count_bytes
- count
- size
- contents
- contents_utf8
- md5
- might_be_binary
- contains_binary
- magic_number
- lines
- read
- create
- touch
- open
- add_str
- close
- write
- writelines
- remove
- copy
- execute
- replace_extension
- new_name_insert
- make_directory
- must_exist
- head
- tail
- move_to
- rename
- gzip_to
- gzip_internal
- gzip_external
- gzip_pigz
- ungzip_to
- ungzip_internal
- ungzip_external
- zip_to
- unzip_to
- untar_to
- untargz_to
- untargz_to_internal
- untargz_to_external
- append
- prepend
- remove_line
- remove_first_line
- replace_line
- replace_word
- sed_replace
- autopaths.base_path.BasePath
- clean_path
- short_prefix
- extension
- escaped
- absolute_path
- physical_path
- relative_path
- rel_path_from
- exists
- is_symlink
- permissions
- mdate
- mdate_iso
- cdate
- cdate_iso
- unix_style
- wsl_style
- win_style
- with_tilda
- with_home
- link_from
- link_to
- symlinks_on_linux
- symlinks_on_windows
- hard_link_win_to
- builtins.str
- encode
- replace
- split
- rsplit
- join
- capitalize
- casefold
- title
- center
- expandtabs
- find
- partition
- index
- ljust
- lower
- lstrip
- rfind
- rindex
- rjust
- rstrip
- rpartition
- splitlines
- strip
- swapcase
- translate
- upper
- startswith
- endswith
- removeprefix
- removesuffix
- isascii
- islower
- isupper
- istitle
- isspace
- isdecimal
- isdigit
- isnumeric
- isalpha
- isalnum
- isidentifier
- isprintable
- zfill
- format
- format_map
- maketrans