seqsearch.databases.string

Written by Lucas Sinclair. MIT Licensed. Contact at www.sinclair.bio

  1#!/usr/bin/env python3
  2# -*- coding: utf-8 -*-
  3
  4"""
  5Written by Lucas Sinclair.
  6MIT Licensed.
  7Contact at www.sinclair.bio
  8"""
  9
 10# Built-in modules #
 11import urllib
 12from collections import OrderedDict
 13
 14# Internal modules #
 15from seqsearch.databases import base_directory
 16
 17# First party modules #
 18from fasta import FASTA
 19from autopaths.auto_paths import AutoPaths
 20from autopaths.file_path import FilePath
 21
 22###############################################################################
 23class String(object):
 24    """
 25    The STRING database. See:
 26    http://string.embl.de/newstring_cgi/show_download_page.pl
 27    """
 28
 29    base_url = "http://string.embl.de/newstring_download/"
 30    short_name = "string"
 31
 32    all_paths = """
 33    /raw/all_proteins.fasta.gz
 34    /raw/cog_mappings.tsv.gz
 35    /unzipped/all_proteins.fasta
 36    /unzipped/cog_mappings.tsv
 37    /blast_db/all_proteins.fasta
 38    /blast_db/all_proteins.fasta.00.pin
 39    /blast_db/logfile.txt
 40    /blast_db/out.txt
 41    """
 42
 43    def __init__(self, seq_type='prot'):
 44        self.seq_type = seq_type
 45        self.base_dir = base_directory + self.short_name
 46        self.p        = AutoPaths(self.base_dir, self.all_paths)
 47
 48    @property
 49    def files_to_retrieve(self):
 50        """The files we want to download with their destinations."""
 51        result = OrderedDict()
 52        result[self.base_url + "protein.sequences.v9.1.fa.gz"] = FilePath(self.p.raw_proteins)
 53        result[self.base_url + "COG.mappings.v9.1.txt.gz"]     = FilePath(self.p.raw_mappings)
 54        return result
 55
 56    @property
 57    def files_remaining(self):
 58        """The files we haven't downloaded yet based on size checks."""
 59        get_size_http = lambda url: urllib.urlopen(url).info().getheaders("Content-Length")[0]
 60        return OrderedDict((source, dest) for source, dest in self.files_to_retrieve.items()
 61                           if dest.count_bytes != get_size_http(source))
 62
 63    def download(self):
 64        """Retrieve all files from the website"""
 65        for source, dest in self.files_remaining.items():
 66            dest.remove()
 67            urllib.urlretrieve(source, dest)
 68            dest.permissions.only_readable()
 69
 70    @property
 71    def raw_files(self):
 72        """The files we have downloaded."""
 73        return map(FilePath, self.p.raw_dir.contents)
 74
 75    def unzip(self):
 76        """Unzip them"""
 77        for f in self.raw_files: f.ungzip_to(self.p.unzipped_dir + f.prefix)
 78
 79    @property
 80    def all_proteins(self):
 81        """The main fasta file."""
 82        return FASTA(self.p.unzipped_proteins)
 83
 84    @property
 85    def mappings(self):
 86        """The cog mappings."""
 87        return FilePath(self.p.unzipped_mappings)
 88
 89    @property
 90    def blast_db(self):
 91        """A BLASTable version of the sequences."""
 92        if not self.p.blast_fasta.exists:
 93            self.p.unzipped_proteins.link_to(self.p.blast_fasta, safe=True)
 94        from seqsearch.search.blast import BLASTdb
 95        blast_db = BLASTdb(self.p.blast_fasta, 'prot')
 96        if not self.p.pin.exists:
 97            blast_db.makeblastdb(logfile=self.p.logfile, out=self.p.out)
 98        return blast_db
 99
100###############################################################################
101string = String()
class String:
class String(object):
    """
    The STRING database. See:
    http://string.embl.de/newstring_cgi/show_download_page.pl

    The intended sequence is `download()`, then `unzip()`, after which
    `all_proteins`, `mappings` and `blast_db` give access to the data.
    """

    base_url = "http://string.embl.de/newstring_download/"
    short_name = "string"

    # Layout of the files kept under `base_dir`, handed to AutoPaths which
    # exposes them as attributes of `self.p`.
    all_paths = """
    /raw/all_proteins.fasta.gz
    /raw/cog_mappings.tsv.gz
    /unzipped/all_proteins.fasta
    /unzipped/cog_mappings.tsv
    /blast_db/all_proteins.fasta
    /blast_db/all_proteins.fasta.00.pin
    /blast_db/logfile.txt
    /blast_db/out.txt
    """

    def __init__(self, seq_type='prot'):
        """
        Record the sequence type and set up the on-disk layout.

        :param seq_type: Stored as `self.seq_type`; defaults to 'prot'.
        """
        self.seq_type = seq_type
        self.base_dir = base_directory + self.short_name
        self.p        = AutoPaths(self.base_dir, self.all_paths)

    @property
    def files_to_retrieve(self):
        """The files we want to download with their destinations.

        :return: An OrderedDict mapping each source URL to the FilePath
                 it should be saved to.
        """
        result = OrderedDict()
        result[self.base_url + "protein.sequences.v9.1.fa.gz"] = FilePath(self.p.raw_proteins)
        result[self.base_url + "COG.mappings.v9.1.txt.gz"]     = FilePath(self.p.raw_mappings)
        return result

    @property
    def files_remaining(self):
        """The files we haven't downloaded yet based on size checks.

        A file counts as remaining when the size of the local copy differs
        from the size announced by the server in its Content-Length header.
        When the server announces no size, the file is treated as remaining.

        :return: An OrderedDict with the same shape as `files_to_retrieve`,
                 restricted to the entries that still need downloading.
        """
        # Imported locally because a bare `import urllib` does not load the
        # `urllib.request` submodule on Python 3.
        from urllib.request import urlopen

        def get_size_http(url):
            """Return the announced size of *url* in bytes, or None."""
            # The context manager closes the connection as soon as the
            # headers have been read; the body is never fetched in full.
            with urlopen(url) as response:
                length = response.headers.get("Content-Length")
            # Headers are text: convert so the comparison with the local
            # byte count is numeric (assumes `count_bytes` is an int --
            # TODO confirm against autopaths).
            return int(length) if length is not None else None

        return OrderedDict(
            (source, dest) for source, dest in self.files_to_retrieve.items()
            if dest.count_bytes != get_size_http(source))

    def download(self):
        """Retrieve all files from the website that are still remaining."""
        # Python 3 location of the former `urllib.urlretrieve`.
        from urllib.request import urlretrieve
        for source, dest in self.files_remaining.items():
            # Discard any stale or partial copy before fetching again.
            dest.remove()
            urlretrieve(source, dest)
            dest.permissions.only_readable()

    @property
    def raw_files(self):
        """The files we have downloaded.

        :return: A list of FilePath objects, one per entry in the raw
                 directory.
        """
        # A real list (not a lazy `map` object) so that callers can iterate
        # over the result more than once or take its length.
        return [FilePath(path) for path in self.p.raw_dir.contents]

    def unzip(self):
        """Unzip every downloaded raw file into the unzipped directory."""
        for f in self.raw_files:
            f.ungzip_to(self.p.unzipped_dir + f.prefix)

    @property
    def all_proteins(self):
        """The main fasta file."""
        return FASTA(self.p.unzipped_proteins)

    @property
    def mappings(self):
        """The cog mappings."""
        return FilePath(self.p.unzipped_mappings)

    @property
    def blast_db(self):
        """A BLASTable version of the sequences.

        On first access the unzipped FASTA file is linked into the blast
        directory and `makeblastdb` is run; both steps are skipped once
        their output files exist.
        """
        if not self.p.blast_fasta.exists:
            self.p.unzipped_proteins.link_to(self.p.blast_fasta, safe=True)
        # Imported here rather than at module level, as in the original.
        from seqsearch.search.blast import BLASTdb
        blast_db = BLASTdb(self.p.blast_fasta, 'prot')
        # The `.pin` file is used as the marker that the database was built.
        if not self.p.pin.exists:
            blast_db.makeblastdb(logfile=self.p.logfile, out=self.p.out)
        return blast_db
String(seq_type='prot')
44    def __init__(self, seq_type='prot'):
45        self.seq_type = seq_type
46        self.base_dir = base_directory + self.short_name
47        self.p        = AutoPaths(self.base_dir, self.all_paths)
files_to_retrieve

The files we want to download with their destinations.

files_remaining

The files we haven't downloaded yet based on size checks.

def download(self):
64    def download(self):
65        """Retrieve all files from the website"""
66        for source, dest in self.files_remaining.items():
67            dest.remove()
68            urllib.urlretrieve(source, dest)
69            dest.permissions.only_readable()

Retrieve from the website all files that have not been completely downloaded yet.

raw_files

The files we have downloaded.

def unzip(self):
76    def unzip(self):
77        """Unzip them"""
78        for f in self.raw_files: f.ungzip_to(self.p.unzipped_dir + f.prefix)

Unzip every downloaded raw file into the unzipped directory.

all_proteins

The main fasta file.

mappings

The cog mappings.

blast_db

A BLASTable version of the sequences.