seqsearch.databases

Written by Lucas Sinclair. MIT Licensed. Contact at www.sinclair.bio

  1#!/usr/bin/env python3
  2# -*- coding: utf-8 -*-
  3
  4"""
  5Written by Lucas Sinclair.
  6MIT Licensed.
  7Contact at www.sinclair.bio
  8"""
  9
 10# Built-in modules #
 11import os, fnmatch
 12from collections import OrderedDict, Counter
 13
 14# First party modules #
 15from fasta import FASTA
 16from autopaths.auto_paths import AutoPaths
 17from autopaths.dir_path   import DirectoryPath
 18from autopaths.file_path  import FilePath
 19from plumbing.cache       import property_cached
 20from plumbing.common      import natural_sort
 21
 22# Third party modules #
 23from tqdm import tqdm
 24
 25# Constants #
 26home = os.environ.get('HOME', '~') + '/'
 27base_directory = home + "databases/"
 28
 29###############################################################################
 30class Database:
 31    """General database object to inherit from."""
 32
 33    all_paths = """
 34    /raw/
 35    /unzipped/
 36    /blast_db/
 37    """
 38
 39    def __init__(self, seq_type=None, base_dir=None):
 40        # The sequence type is either 'prot' or 'nucl' #
 41        self.seq_type = seq_type
 42        # The default base directory #
 43        if base_dir is None:
 44            base_dir = os.environ.get('HOME', '/') + '/'
 45        # Make base_dir object #
 46        self.base_dir = base_dir + 'databases/' + self.short_name + '/'
 47        self.base_dir = DirectoryPath(self.base_dir)
 48        # Make autopaths object #
 49        self.autopaths = AutoPaths(self.base_dir, self.all_paths)
 50
 51    def __bool__(self):
 52        """
 53        Return True if the database was already downloaded and the
 54        results are stored on the filesystem. Return False otherwise.
 55        """
 56        return not self.autopaths.unzipped_dir.empty
 57
 58    @property_cached
 59    def ftp(self):
 60        """If the data is to be obtained by FTP, here is the ftputil object."""
 61        from ftputil import FTPHost
 62        ftp = FTPHost(self.ftp_url, "anonymous")
 63        ftp.chdir(self.ftp_dir)
 64        return ftp
 65
 66    @property_cached
 67    def files_to_retrieve(self):
 68        """The files we want to download with their destinations."""
 69        if hasattr(self, "pattern"):
 70            files = self.ftp.listdir(self.ftp.curdir)
 71            files.sort(key=natural_sort)
 72            return OrderedDict((f, FilePath(self.autopaths.raw_dir+f)) for f in files
 73                               if fnmatch.fnmatch(f, self.pattern))
 74        if hasattr(self, "files"):
 75            return OrderedDict((f, FilePath(self.autopaths.raw_dir+f)) for f in self.files)
 76
 77    @property
 78    def files_remaining(self):
 79        """The files we haven't downloaded yet based on size checks."""
 80        return OrderedDict((source,dest) for source, dest in self.files_to_retrieve.items()
 81                           if dest.count_bytes != self.ftp.path.getsize(source))
 82
 83    def download(self):
 84        """Retrieve all files from the FTP site."""
 85        # Create the directory #
 86        self.base_dir.create_if_not_exists()
 87        # Loop over files #
 88        for source, dest in tqdm(self.files_remaining.items()):
 89            dest.remove()
 90            self.ftp.download(source, dest)
 91            dest.permissions.only_readable()
 92
 93    @property
 94    def raw_files(self):
 95        """The files we have downloaded."""
 96        return map(FASTA, self.autopaths.raw_dir.contents)
 97
 98    def ungzip(self):
 99        """Ungzip them."""
100        # Gzip #
101        for f in tqdm(self.raw_files):
102            destination = self.autopaths.unzipped_dir+f.prefix
103            f.ungzip_to(destination)
104            destination.permissions.only_readable()
105
106    def untargz(self):
107        """Untargzip them."""
108        # Gzip #
109        for f in tqdm(self.raw_files): f.untargz_to(self.autopaths.unzipped_dir)
110        for f in self.autopaths.unzipped_dir: f.permissions.only_readable()
111
112    @property
113    def sequences(self):
114        """All the sequences from all the raw files."""
115        for fasta in self.raw_files:
116            for seq in fasta: yield seq
117
118    #------------------ Only for preformatted BLAST databases ----------------#
119    @property_cached
120    def blast_db(self):
121        """A BLASTable version of the sequences."""
122        # Import #
123        from seqsearch.search.blast import BLASTdb
124        # Create object #
125        db = BLASTdb(self.autopaths.unzipped_dir + self.db_name,
126                     self.seq_type)
127        # Return #
128        return db
129
130    #--------------------- Only for taxonomic databases ----------------------#
131    @property_cached
132    def tax_depth_freq(self):
133        def depths():
134            with open(self.taxonomy, 'r') as handle:
135                for line in handle:
136                    line = line.strip('\n')
137                    otu_name, species = line.split('\t')
138                    yield len(species.split(';'))
139        return Counter(depths())
class Database:
 31class Database:
 32    """General database object to inherit from."""
 33
 34    all_paths = """
 35    /raw/
 36    /unzipped/
 37    /blast_db/
 38    """
 39
 40    def __init__(self, seq_type=None, base_dir=None):
 41        # The sequence type is either 'prot' or 'nucl' #
 42        self.seq_type = seq_type
 43        # The default base directory #
 44        if base_dir is None:
 45            base_dir = os.environ.get('HOME', '/') + '/'
 46        # Make base_dir object #
 47        self.base_dir = base_dir + 'databases/' + self.short_name + '/'
 48        self.base_dir = DirectoryPath(self.base_dir)
 49        # Make autopaths object #
 50        self.autopaths = AutoPaths(self.base_dir, self.all_paths)
 51
 52    def __bool__(self):
 53        """
 54        Return True if the database was already downloaded and the
 55        results are stored on the filesystem. Return False otherwise.
 56        """
 57        return not self.autopaths.unzipped_dir.empty
 58
 59    @property_cached
 60    def ftp(self):
 61        """If the data is to be obtained by FTP, here is the ftputil object."""
 62        from ftputil import FTPHost
 63        ftp = FTPHost(self.ftp_url, "anonymous")
 64        ftp.chdir(self.ftp_dir)
 65        return ftp
 66
 67    @property_cached
 68    def files_to_retrieve(self):
 69        """The files we want to download with their destinations."""
 70        if hasattr(self, "pattern"):
 71            files = self.ftp.listdir(self.ftp.curdir)
 72            files.sort(key=natural_sort)
 73            return OrderedDict((f, FilePath(self.autopaths.raw_dir+f)) for f in files
 74                               if fnmatch.fnmatch(f, self.pattern))
 75        if hasattr(self, "files"):
 76            return OrderedDict((f, FilePath(self.autopaths.raw_dir+f)) for f in self.files)
 77
 78    @property
 79    def files_remaining(self):
 80        """The files we haven't downloaded yet based on size checks."""
 81        return OrderedDict((source,dest) for source, dest in self.files_to_retrieve.items()
 82                           if dest.count_bytes != self.ftp.path.getsize(source))
 83
 84    def download(self):
 85        """Retrieve all files from the FTP site."""
 86        # Create the directory #
 87        self.base_dir.create_if_not_exists()
 88        # Loop over files #
 89        for source, dest in tqdm(self.files_remaining.items()):
 90            dest.remove()
 91            self.ftp.download(source, dest)
 92            dest.permissions.only_readable()
 93
 94    @property
 95    def raw_files(self):
 96        """The files we have downloaded."""
 97        return map(FASTA, self.autopaths.raw_dir.contents)
 98
 99    def ungzip(self):
100        """Ungzip them."""
101        # Gzip #
102        for f in tqdm(self.raw_files):
103            destination = self.autopaths.unzipped_dir+f.prefix
104            f.ungzip_to(destination)
105            destination.permissions.only_readable()
106
107    def untargz(self):
108        """Untargzip them."""
109        # Gzip #
110        for f in tqdm(self.raw_files): f.untargz_to(self.autopaths.unzipped_dir)
111        for f in self.autopaths.unzipped_dir: f.permissions.only_readable()
112
113    @property
114    def sequences(self):
115        """All the sequences from all the raw files."""
116        for fasta in self.raw_files:
117            for seq in fasta: yield seq
118
119    #------------------ Only for preformatted BLAST databases ----------------#
120    @property_cached
121    def blast_db(self):
122        """A BLASTable version of the sequences."""
123        # Import #
124        from seqsearch.search.blast import BLASTdb
125        # Create object #
126        db = BLASTdb(self.autopaths.unzipped_dir + self.db_name,
127                     self.seq_type)
128        # Return #
129        return db
130
131    #--------------------- Only for taxonomic databases ----------------------#
132    @property_cached
133    def tax_depth_freq(self):
134        def depths():
135            with open(self.taxonomy, 'r') as handle:
136                for line in handle:
137                    line = line.strip('\n')
138                    otu_name, species = line.split('\t')
139                    yield len(species.split(';'))
140        return Counter(depths())

General database object to inherit from.

Database(seq_type=None, base_dir=None)
40    def __init__(self, seq_type=None, base_dir=None):
41        # The sequence type is either 'prot' or 'nucl' #
42        self.seq_type = seq_type
43        # The default base directory #
44        if base_dir is None:
45            base_dir = os.environ.get('HOME', '/') + '/'
46        # Make base_dir object #
47        self.base_dir = base_dir + 'databases/' + self.short_name + '/'
48        self.base_dir = DirectoryPath(self.base_dir)
49        # Make autopaths object #
50        self.autopaths = AutoPaths(self.base_dir, self.all_paths)
ftp

If the data is to be obtained by FTP, here is the ftputil object.

files_to_retrieve

The files we want to download with their destinations.

files_remaining

The files we haven't downloaded yet based on size checks.

def download(self):
84    def download(self):
85        """Retrieve all files from the FTP site."""
86        # Create the directory #
87        self.base_dir.create_if_not_exists()
88        # Loop over files #
89        for source, dest in tqdm(self.files_remaining.items()):
90            dest.remove()
91            self.ftp.download(source, dest)
92            dest.permissions.only_readable()

Retrieve all files from the FTP site.

raw_files

The files we have downloaded.

def ungzip(self):
 99    def ungzip(self):
100        """Ungzip them."""
101        # Gzip #
102        for f in tqdm(self.raw_files):
103            destination = self.autopaths.unzipped_dir+f.prefix
104            f.ungzip_to(destination)
105            destination.permissions.only_readable()

Ungzip every downloaded raw file into the unzipped directory.

def untargz(self):
107    def untargz(self):
108        """Untargzip them."""
109        # Gzip #
110        for f in tqdm(self.raw_files): f.untargz_to(self.autopaths.unzipped_dir)
111        for f in self.autopaths.unzipped_dir: f.permissions.only_readable()

Extract every downloaded tar.gz raw file into the unzipped directory.

sequences

All the sequences from all the raw files.

blast_db

A BLASTable version of the sequences.