seqsearch.databases
Written by Lucas Sinclair. MIT Licensed. Contact at www.sinclair.bio
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Written by Lucas Sinclair.
MIT Licensed.
Contact at www.sinclair.bio
"""

# Built-in modules #
import os
import fnmatch
from collections import OrderedDict, Counter

# First party modules #
from fasta import FASTA
from autopaths.auto_paths import AutoPaths
from autopaths.dir_path import DirectoryPath
from autopaths.file_path import FilePath
from plumbing.cache import property_cached
from plumbing.common import natural_sort

# Third party modules #
from tqdm import tqdm

# Constants #
home = os.environ.get('HOME', '~') + '/'
base_directory = home + "databases/"

###############################################################################
class Database:
    """
    General database object to inherit from.

    Subclasses are expected to provide a `short_name` attribute (used to
    build the storage directory) and, depending on the features they use:
    `ftp_url` and `ftp_dir` together with either `pattern` or `files` for
    downloading, `db_name` for `blast_db`, and `taxonomy` for
    `tax_depth_freq`.
    """

    all_paths = """
    /raw/
    /unzipped/
    /blast_db/
    """

    def __init__(self, seq_type=None, base_dir=None):
        """
        Set up the directory layout of the database.

        :param seq_type: The sequence type, either 'prot' or 'nucl'.
        :param base_dir: The directory under which 'databases/<short_name>/'
                         is placed. Defaults to the `HOME` environment
                         variable, or '/' when it is not set.
        """
        # The sequence type is either 'prot' or 'nucl' #
        self.seq_type = seq_type
        # The default base directory #
        if base_dir is None:
            base_dir = os.environ.get('HOME', '/')
        # Join the components so that a `base_dir` given without a trailing
        # slash does not get glued to 'databases'. The final empty component
        # keeps the trailing slash on the result #
        self.base_dir = os.path.join(str(base_dir), 'databases',
                                     self.short_name, '')
        self.base_dir = DirectoryPath(self.base_dir)
        # Make autopaths object #
        self.autopaths = AutoPaths(self.base_dir, self.all_paths)

    def __bool__(self):
        """
        Return True if the database was already downloaded and the
        results are stored on the filesystem. Return False otherwise.
        """
        return not self.autopaths.unzipped_dir.empty

    @property_cached
    def ftp(self):
        """If the data is to be obtained by FTP, here is the ftputil object."""
        from ftputil import FTPHost
        ftp = FTPHost(self.ftp_url, "anonymous")
        ftp.chdir(self.ftp_dir)
        return ftp

    @property_cached
    def files_to_retrieve(self):
        """
        The files we want to download with their destinations.

        Returns an ordered mapping of remote file name to local path in the
        raw directory. When the object has a `pattern` attribute, the remote
        directory listing is filtered with it; otherwise the names in `files`
        are used. Returns None when neither attribute is defined.
        """
        if hasattr(self, "pattern"):
            files = self.ftp.listdir(self.ftp.curdir)
            files.sort(key=natural_sort)
            return OrderedDict((f, FilePath(self.autopaths.raw_dir + f))
                               for f in files
                               if fnmatch.fnmatch(f, self.pattern))
        if hasattr(self, "files"):
            return OrderedDict((f, FilePath(self.autopaths.raw_dir + f))
                               for f in self.files)
        return None

    @property
    def files_remaining(self):
        """
        The files we haven't downloaded yet based on size checks.

        A file is considered remaining when the byte count of the local
        copy differs from the size reported by the FTP server.
        """
        return OrderedDict((source, dest)
                           for source, dest in self.files_to_retrieve.items()
                           if dest.count_bytes != self.ftp.path.getsize(source))

    def download(self):
        """Retrieve all files from the FTP site."""
        # Create the directory #
        self.base_dir.create_if_not_exists()
        # Loop over files #
        for source, dest in tqdm(self.files_remaining.items()):
            dest.remove()
            self.ftp.download(source, dest)
            dest.permissions.only_readable()

    @property
    def raw_files(self):
        """
        The files we have downloaded, each wrapped in a FASTA object.

        Returned as a list rather than a lazy `map` object so that callers
        such as progress bars can know how many files there are.
        """
        return [FASTA(path) for path in self.autopaths.raw_dir.contents]

    def ungzip(self):
        """Decompress every raw file into the unzipped directory."""
        for f in tqdm(self.raw_files):
            destination = self.autopaths.unzipped_dir + f.prefix
            # An earlier run leaves the output marked read-only, so clear it
            # before writing again, the same way `download` does #
            destination.remove()
            f.ungzip_to(destination)
            destination.permissions.only_readable()

    def untargz(self):
        """Extract every raw tar.gz archive into the unzipped directory."""
        # Extract #
        for f in tqdm(self.raw_files):
            f.untargz_to(self.autopaths.unzipped_dir)
        # Protect the results #
        for f in self.autopaths.unzipped_dir:
            f.permissions.only_readable()

    @property
    def sequences(self):
        """All the sequences from all the raw files."""
        for fasta in self.raw_files:
            yield from fasta

    #------------------ Only for preformatted BLAST databases ----------------#
    @property_cached
    def blast_db(self):
        """A BLASTable version of the sequences."""
        # Import #
        from seqsearch.search.blast import BLASTdb
        # Create object #
        db = BLASTdb(self.autopaths.unzipped_dir + self.db_name,
                     self.seq_type)
        # Return #
        return db

    #--------------------- Only for taxonomic databases ----------------------#
    @property_cached
    def tax_depth_freq(self):
        """
        Count how many entries exist for each lineage depth.

        Reads the file at `self.taxonomy`, where every line holds two
        tab-separated fields. The second field is split on ';' and the
        number of resulting parts is tallied. Returns a Counter mapping
        the depth to the number of lines that have it.
        """
        def depths():
            with open(self.taxonomy, 'r') as handle:
                for line in handle:
                    line = line.strip('\n')
                    # Blank lines (e.g. at the end of the file) carry no
                    # record and would break the two-field unpacking #
                    if not line:
                        continue
                    _, lineage = line.split('\t')
                    yield len(lineage.split(';'))
        return Counter(depths())
class
Database:
class Database:
    """
    General database object to inherit from.

    Subclasses are expected to provide a `short_name` attribute (used to
    build the storage directory) and, depending on the features they use:
    `ftp_url` and `ftp_dir` together with either `pattern` or `files` for
    downloading, `db_name` for `blast_db`, and `taxonomy` for
    `tax_depth_freq`.
    """

    all_paths = """
    /raw/
    /unzipped/
    /blast_db/
    """

    def __init__(self, seq_type=None, base_dir=None):
        """
        Set up the directory layout of the database.

        :param seq_type: The sequence type, either 'prot' or 'nucl'.
        :param base_dir: The directory under which 'databases/<short_name>/'
                         is placed. Defaults to the `HOME` environment
                         variable, or '/' when it is not set.
        """
        # The sequence type is either 'prot' or 'nucl' #
        self.seq_type = seq_type
        # The default base directory #
        if base_dir is None:
            base_dir = os.environ.get('HOME', '/')
        # Join the components so that a `base_dir` given without a trailing
        # slash does not get glued to 'databases'. The final empty component
        # keeps the trailing slash on the result #
        self.base_dir = os.path.join(str(base_dir), 'databases',
                                     self.short_name, '')
        self.base_dir = DirectoryPath(self.base_dir)
        # Make autopaths object #
        self.autopaths = AutoPaths(self.base_dir, self.all_paths)

    def __bool__(self):
        """
        Return True if the database was already downloaded and the
        results are stored on the filesystem. Return False otherwise.
        """
        return not self.autopaths.unzipped_dir.empty

    @property_cached
    def ftp(self):
        """If the data is to be obtained by FTP, here is the ftputil object."""
        from ftputil import FTPHost
        ftp = FTPHost(self.ftp_url, "anonymous")
        ftp.chdir(self.ftp_dir)
        return ftp

    @property_cached
    def files_to_retrieve(self):
        """
        The files we want to download with their destinations.

        Returns an ordered mapping of remote file name to local path in the
        raw directory. When the object has a `pattern` attribute, the remote
        directory listing is filtered with it; otherwise the names in `files`
        are used. Returns None when neither attribute is defined.
        """
        if hasattr(self, "pattern"):
            files = self.ftp.listdir(self.ftp.curdir)
            files.sort(key=natural_sort)
            return OrderedDict((f, FilePath(self.autopaths.raw_dir + f))
                               for f in files
                               if fnmatch.fnmatch(f, self.pattern))
        if hasattr(self, "files"):
            return OrderedDict((f, FilePath(self.autopaths.raw_dir + f))
                               for f in self.files)
        return None

    @property
    def files_remaining(self):
        """
        The files we haven't downloaded yet based on size checks.

        A file is considered remaining when the byte count of the local
        copy differs from the size reported by the FTP server.
        """
        return OrderedDict((source, dest)
                           for source, dest in self.files_to_retrieve.items()
                           if dest.count_bytes != self.ftp.path.getsize(source))

    def download(self):
        """Retrieve all files from the FTP site."""
        # Create the directory #
        self.base_dir.create_if_not_exists()
        # Loop over files #
        for source, dest in tqdm(self.files_remaining.items()):
            dest.remove()
            self.ftp.download(source, dest)
            dest.permissions.only_readable()

    @property
    def raw_files(self):
        """
        The files we have downloaded, each wrapped in a FASTA object.

        Returned as a list rather than a lazy `map` object so that callers
        such as progress bars can know how many files there are.
        """
        return [FASTA(path) for path in self.autopaths.raw_dir.contents]

    def ungzip(self):
        """Decompress every raw file into the unzipped directory."""
        for f in tqdm(self.raw_files):
            destination = self.autopaths.unzipped_dir + f.prefix
            # An earlier run leaves the output marked read-only, so clear it
            # before writing again, the same way `download` does #
            destination.remove()
            f.ungzip_to(destination)
            destination.permissions.only_readable()

    def untargz(self):
        """Extract every raw tar.gz archive into the unzipped directory."""
        # Extract #
        for f in tqdm(self.raw_files):
            f.untargz_to(self.autopaths.unzipped_dir)
        # Protect the results #
        for f in self.autopaths.unzipped_dir:
            f.permissions.only_readable()

    @property
    def sequences(self):
        """All the sequences from all the raw files."""
        for fasta in self.raw_files:
            yield from fasta

    #------------------ Only for preformatted BLAST databases ----------------#
    @property_cached
    def blast_db(self):
        """A BLASTable version of the sequences."""
        # Import #
        from seqsearch.search.blast import BLASTdb
        # Create object #
        db = BLASTdb(self.autopaths.unzipped_dir + self.db_name,
                     self.seq_type)
        # Return #
        return db

    #--------------------- Only for taxonomic databases ----------------------#
    @property_cached
    def tax_depth_freq(self):
        """
        Count how many entries exist for each lineage depth.

        Reads the file at `self.taxonomy`, where every line holds two
        tab-separated fields. The second field is split on ';' and the
        number of resulting parts is tallied. Returns a Counter mapping
        the depth to the number of lines that have it.
        """
        def depths():
            with open(self.taxonomy, 'r') as handle:
                for line in handle:
                    line = line.strip('\n')
                    # Blank lines (e.g. at the end of the file) carry no
                    # record and would break the two-field unpacking #
                    if not line:
                        continue
                    _, lineage = line.split('\t')
                    yield len(lineage.split(';'))
        return Counter(depths())
General database object to inherit from.
Database(seq_type=None, base_dir=None)
def __init__(self, seq_type=None, base_dir=None):
    """
    Set up the directory layout of the database.

    :param seq_type: The sequence type, either 'prot' or 'nucl'.
    :param base_dir: The directory under which 'databases/<short_name>/'
                     is placed. Defaults to the `HOME` environment
                     variable, or '/' when it is not set.
    """
    # The sequence type is either 'prot' or 'nucl' #
    self.seq_type = seq_type
    # The default base directory #
    if base_dir is None:
        base_dir = os.environ.get('HOME', '/')
    # Join the components so that a `base_dir` given without a trailing
    # slash does not get glued to 'databases'. The final empty component
    # keeps the trailing slash on the result #
    self.base_dir = os.path.join(str(base_dir), 'databases',
                                 self.short_name, '')
    self.base_dir = DirectoryPath(self.base_dir)
    # Make autopaths object #
    self.autopaths = AutoPaths(self.base_dir, self.all_paths)
def
download(self):
def download(self):
    """
    Fetch from the FTP site every file that is still missing locally.

    Each outdated or absent local copy is removed, downloaded again and
    then marked as read-only.
    """
    # Make sure the database directory is there #
    self.base_dir.create_if_not_exists()
    # Work out what is left to fetch #
    pending = self.files_remaining
    # Fetch each one, with a progress bar #
    for remote_name, local_path in tqdm(pending.items()):
        local_path.remove()
        self.ftp.download(remote_name, local_path)
        local_path.permissions.only_readable()
Retrieve all files from the FTP site.
def
ungzip(self):
def ungzip(self):
    """
    Decompress every raw file into the unzipped directory.

    Each output is named after the raw file's `prefix` and is marked
    read-only once written.
    """
    for f in tqdm(self.raw_files):
        destination = self.autopaths.unzipped_dir + f.prefix
        # An earlier run leaves the output marked read-only, so clear it
        # before writing again, the same way `download` does #
        destination.remove()
        f.ungzip_to(destination)
        destination.permissions.only_readable()
Decompress (gunzip) every downloaded raw file into the unzipped directory.