seqsearch.databases.string
Written by Lucas Sinclair. MIT Licensed. Contact at www.sinclair.bio
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Written by Lucas Sinclair.
MIT Licensed.
Contact at www.sinclair.bio
"""

# Built-in modules #
import urllib
import urllib.request
from collections import OrderedDict

# Internal modules #
from seqsearch.databases import base_directory

# First party modules #
from fasta import FASTA
from autopaths.auto_paths import AutoPaths
from autopaths.file_path import FilePath

###############################################################################
class String(object):
    """
    The STRING database. See:
    http://string.embl.de/newstring_cgi/show_download_page.pl

    The class knows which remote archives to fetch, where they are stored
    below `base_directory + short_name`, how to unzip them and how to
    expose the protein sequences as a BLAST database.
    """

    # Remote directory all downloads are relative to.
    base_url = "http://string.embl.de/newstring_download/"
    # Name of the sub-directory (appended to `base_directory`) holding the data.
    short_name = "string"

    # Directory layout handed to AutoPaths.
    all_paths = """
    /raw/all_proteins.fasta.gz
    /raw/cog_mappings.tsv.gz
    /unzipped/all_proteins.fasta
    /unzipped/cog_mappings.tsv
    /blast_db/all_proteins.fasta
    /blast_db/all_proteins.fasta.00.pin
    /blast_db/logfile.txt
    /blast_db/out.txt
    """

    def __init__(self, seq_type='prot'):
        """
        Record the sequence type and set up the on-disk path layout.

        `seq_type` is only stored on the instance; nothing else in this
        class reads it.
        """
        self.seq_type = seq_type
        self.base_dir = base_directory + self.short_name
        self.p = AutoPaths(self.base_dir, self.all_paths)

    @property
    def files_to_retrieve(self):
        """The files we want to download with their destinations.

        Returns an OrderedDict mapping each remote URL to the local
        FilePath it should be saved to.
        """
        result = OrderedDict()
        result[self.base_url + "protein.sequences.v9.1.fa.gz"] = FilePath(self.p.raw_proteins)
        result[self.base_url + "COG.mappings.v9.1.txt.gz"] = FilePath(self.p.raw_mappings)
        return result

    @staticmethod
    def _remote_size(url):
        """Return the `Content-Length` advertised for *url* as an int.

        Returns None when the server does not send the header, so that
        callers can treat the size as unknown instead of crashing.
        """
        # The response is used as a context manager so that the connection
        # is closed as soon as the headers have been read.
        with urllib.request.urlopen(url) as response:
            length = response.info().get("Content-Length")
        return None if length is None else int(length)

    @property
    def files_remaining(self):
        """The files we haven't downloaded yet based on size checks.

        Returns an OrderedDict (URL -> FilePath) restricted to the entries
        whose local byte count differs from the remote size. A file whose
        remote size is unknown is always reported as remaining.
        """
        # The header value is converted to int before comparing: a raw
        # header string would never equal a numeric byte count.
        # NOTE(review): assumes `FilePath.count_bytes` is an int -- confirm.
        return OrderedDict(
            (source, dest)
            for source, dest in self.files_to_retrieve.items()
            if dest.count_bytes != self._remote_size(source)
        )

    def download(self):
        """Retrieve all files from the website.

        Only the files reported by `files_remaining` are fetched. Each
        destination is removed first, downloaded again, and then
        `dest.permissions.only_readable()` is called on it.
        """
        for source, dest in self.files_remaining.items():
            dest.remove()
            # On Python 3 `urlretrieve` lives in `urllib.request`.
            urllib.request.urlretrieve(source, dest)
            dest.permissions.only_readable()

    @property
    def raw_files(self):
        """The files we have downloaded.

        Returns a list of FilePath objects, one per entry of the raw
        directory's `contents`.
        """
        # A real list (not a lazy `map`) so the result can be iterated
        # more than once on Python 3.
        return [FilePath(path) for path in self.p.raw_dir.contents]

    def unzip(self):
        """Unzip every raw file into the unzipped directory."""
        for f in self.raw_files:
            f.ungzip_to(self.p.unzipped_dir + f.prefix)

    @property
    def all_proteins(self):
        """The main fasta file."""
        return FASTA(self.p.unzipped_proteins)

    @property
    def mappings(self):
        """The cog mappings."""
        return FilePath(self.p.unzipped_mappings)

    @property
    def blast_db(self):
        """A BLASTable version of the sequences.

        Links the unzipped protein FASTA into the BLAST directory when it
        is not there yet, and runs `makeblastdb` when the index path
        `self.p.pin` does not exist. Returns the BLASTdb object.
        """
        if not self.p.blast_fasta.exists:
            self.p.unzipped_proteins.link_to(self.p.blast_fasta, safe=True)
        # Imported here rather than at module level, as in the original.
        from seqsearch.search.blast import BLASTdb
        blast_db = BLASTdb(self.p.blast_fasta, 'prot')
        if not self.p.pin.exists:
            blast_db.makeblastdb(logfile=self.p.logfile, out=self.p.out)
        return blast_db

###############################################################################
string = String()
class
String:
class String(object):
    """
    The STRING database. See:
    http://string.embl.de/newstring_cgi/show_download_page.pl

    The class knows which remote archives to fetch, where they are stored
    below `base_directory + short_name`, how to unzip them and how to
    expose the protein sequences as a BLAST database.
    """

    # Remote directory all downloads are relative to.
    base_url = "http://string.embl.de/newstring_download/"
    # Name of the sub-directory (appended to `base_directory`) holding the data.
    short_name = "string"

    # Directory layout handed to AutoPaths.
    all_paths = """
    /raw/all_proteins.fasta.gz
    /raw/cog_mappings.tsv.gz
    /unzipped/all_proteins.fasta
    /unzipped/cog_mappings.tsv
    /blast_db/all_proteins.fasta
    /blast_db/all_proteins.fasta.00.pin
    /blast_db/logfile.txt
    /blast_db/out.txt
    """

    def __init__(self, seq_type='prot'):
        """
        Record the sequence type and set up the on-disk path layout.

        `seq_type` is only stored on the instance; nothing else in this
        class reads it.
        """
        self.seq_type = seq_type
        self.base_dir = base_directory + self.short_name
        self.p = AutoPaths(self.base_dir, self.all_paths)

    @property
    def files_to_retrieve(self):
        """The files we want to download with their destinations.

        Returns an OrderedDict mapping each remote URL to the local
        FilePath it should be saved to.
        """
        result = OrderedDict()
        result[self.base_url + "protein.sequences.v9.1.fa.gz"] = FilePath(self.p.raw_proteins)
        result[self.base_url + "COG.mappings.v9.1.txt.gz"] = FilePath(self.p.raw_mappings)
        return result

    @staticmethod
    def _remote_size(url):
        """Return the `Content-Length` advertised for *url* as an int.

        Returns None when the server does not send the header, so that
        callers can treat the size as unknown instead of crashing.
        """
        # Imported locally: the module only does `import urllib`, which on
        # Python 3 does not guarantee that `urllib.request` is loaded.
        import urllib.request
        # The response is used as a context manager so that the connection
        # is closed as soon as the headers have been read.
        with urllib.request.urlopen(url) as response:
            length = response.info().get("Content-Length")
        return None if length is None else int(length)

    @property
    def files_remaining(self):
        """The files we haven't downloaded yet based on size checks.

        Returns an OrderedDict (URL -> FilePath) restricted to the entries
        whose local byte count differs from the remote size. A file whose
        remote size is unknown is always reported as remaining.
        """
        # The header value is converted to int before comparing: a raw
        # header string would never equal a numeric byte count.
        # NOTE(review): assumes `FilePath.count_bytes` is an int -- confirm.
        return OrderedDict(
            (source, dest)
            for source, dest in self.files_to_retrieve.items()
            if dest.count_bytes != self._remote_size(source)
        )

    def download(self):
        """Retrieve all files from the website.

        Only the files reported by `files_remaining` are fetched. Each
        destination is removed first, downloaded again, and then
        `dest.permissions.only_readable()` is called on it.
        """
        # On Python 3 `urlretrieve` lives in `urllib.request`.
        import urllib.request
        for source, dest in self.files_remaining.items():
            dest.remove()
            urllib.request.urlretrieve(source, dest)
            dest.permissions.only_readable()

    @property
    def raw_files(self):
        """The files we have downloaded.

        Returns a list of FilePath objects, one per entry of the raw
        directory's `contents`.
        """
        # A real list (not a lazy `map`) so the result can be iterated
        # more than once on Python 3.
        return [FilePath(path) for path in self.p.raw_dir.contents]

    def unzip(self):
        """Unzip every raw file into the unzipped directory."""
        for f in self.raw_files:
            f.ungzip_to(self.p.unzipped_dir + f.prefix)

    @property
    def all_proteins(self):
        """The main fasta file."""
        return FASTA(self.p.unzipped_proteins)

    @property
    def mappings(self):
        """The cog mappings."""
        return FilePath(self.p.unzipped_mappings)

    @property
    def blast_db(self):
        """A BLASTable version of the sequences.

        Links the unzipped protein FASTA into the BLAST directory when it
        is not there yet, and runs `makeblastdb` when the index path
        `self.p.pin` does not exist. Returns the BLASTdb object.
        """
        if not self.p.blast_fasta.exists:
            self.p.unzipped_proteins.link_to(self.p.blast_fasta, safe=True)
        # Imported here rather than at module level, as in the original.
        from seqsearch.search.blast import BLASTdb
        blast_db = BLASTdb(self.p.blast_fasta, 'prot')
        if not self.p.pin.exists:
            blast_db.makeblastdb(logfile=self.p.logfile, out=self.p.out)
        return blast_db
The STRING database. See: http://string.embl.de/newstring_cgi/show_download_page.pl
def
download(self):
def download(self):
    """Retrieve all files from the website.

    Only the files reported by `self.files_remaining` are fetched. Each
    destination is removed first, downloaded again, and then
    `dest.permissions.only_readable()` is called on it.
    """
    # `urllib.urlretrieve` does not exist on Python 3, where the function
    # lives in `urllib.request`. Imported locally so this method does not
    # depend on the module-level `import urllib` loading the submodule.
    import urllib.request
    for source, dest in self.files_remaining.items():
        dest.remove()
        urllib.request.urlretrieve(source, dest)
        dest.permissions.only_readable()
Retrieve all files from the website that are still reported as remaining by the size check.