seqsearch.databases.pfam
Written by Lucas Sinclair. MIT Licensed. Contact at www.sinclair.bio
1#!/usr/bin/env python3 2# -*- coding: utf-8 -*- 3 4""" 5Written by Lucas Sinclair. 6MIT Licensed. 7Contact at www.sinclair.bio 8""" 9 10# Built-in modules # 11 12# First party modules # 13from seqsearch.databases import Database 14from autopaths.auto_paths import AutoPaths 15from autopaths.dir_path import DirectoryPath 16from plumbing.cache import property_cached 17from fasta import FASTA 18 19# Module for launching shell commands # 20from seqsearch import sh 21 22############################################################################### 23class Pfam(Database): 24 """The Pfam database is a large collection of protein families, 25 each represented by multiple sequence alignments and HMMs. 26 27 http://pfam.xfam.org 28 29 Pfam-A is the manually curated portion of the database that 30 contains over 16,000 entries. An automatically generated supplement 31 is provided called Pfam-B. Pfam-B was discontinued. 32 33 To install: 34 from seqsearch.databases.pfam import pfam 35 pfam.download() 36 pfam.ungzip() 37 38 It will put it in ~/databases/pfam 39 """ 40 41 all_paths = """ 42 /raw/ 43 /unzipped/Pfam-A.hmm 44 /unzipped/Pfam-A.fasta 45 /unzipped/Pfam-A.seed 46 /specific/ 47 """ 48 49 short_name = "pfam" 50 ftp_url = "ftp.ebi.ac.uk" 51 ftp_dir = "/pub/databases/Pfam/current_release/" 52 files = ("Pfam-A.hmm.gz", "Pfam-A.fasta.gz", "Pfam-A.seed.gz") 53 54 @property_cached 55 def hmm_db(self): 56 hmm_db = self.autopaths.hmm 57 hmm_db.seq_type = 'hmm_prot' 58 return hmm_db 59 60 @property_cached 61 def fasta(self): 62 fasta = FASTA(self.autopaths.fasta) 63 return fasta 64 65 @property_cached 66 def seeds(self): 67 seeds = FASTA(self.autopaths.seed) 68 return seeds 69 70############################################################################### 71pfam = Pfam("hmm_prot") 72 73############################################################################### 74class SpecificFamily(object): 75 """When you are interested in having an HMM 'database' with only 76 one specific Pfam in it. As well as the associated proteins that 77 are part of it.""" 78 79 all_paths = """ 80 /model.hmm 81 /proteins.fasta 82 /subsampled.fasta 83 """ 84 85 def __init__(self, fam_name): 86 self.fam_name = fam_name 87 self.base_dir = DirectoryPath(pfam.autopaths.specific_dir + self.fam_name) 88 self.p = AutoPaths(self.base_dir, self.all_paths) 89 90 @property_cached 91 def hmm_db(self): 92 """Create an HMM 'database' with only one profile in it.""" 93 hmm_db = self.p.model 94 hmm_db.seqtype = 'hmm_prot' 95 if not hmm_db.exists: 96 print(sh.hmmfetch('-o', hmm_db, pfam.hmm_db, self.fam_name)) 97 assert hmm_db 98 return hmm_db 99 100 @property_cached 101 def fasta(self): 102 """Make a fasta file with all uniprot proteins that are related to 103 this family.""" 104 fasta = FASTA(self.p.proteins) 105 if not fasta.exists: 106 fasta.create() 107 for seq in pfam.fasta: 108 if self.fam_name in seq.description: fasta.add_seq(seq) 109 fasta.close() 110 assert fasta 111 # Return # 112 return fasta 113 114 @property_cached 115 def subsampled(self): 116 subsampled = FASTA(self.p.subsampled) 117 if not subsampled.exists: 118 self.fasta.subsample(down_to=30, new_path=subsampled) 119 self.add_taxonomy(subsampled) 120 return subsampled 121 122 def add_taxonomy(self, fasta): 123 """Add taxonomic information to the fastas file""" 124 ids = [seq.description for seq in fasta] 125 accesions = [seq.description.split()[1] for seq in fasta] 126 taxonomies = map(self.uniprot_acc_to_taxonmy, accesions) 127 naming_dict = {ids[i]: ids[i] + taxonomies[i] for i in range(len(ids))} 128 fasta.rename_sequences(naming_dict, in_place=True) 129 130 def uniprot_acc_to_taxonmy(self, accesion): 131 """From one uniprot ID to taxonomy""" 132 from bioservices import UniProt 133 u = UniProt() 134 data = u.search(accesion, frmt="xml") 135 from bs4 import BeautifulSoup 136 soup = BeautifulSoup(data, "html.parser") 137 return ' (' + ', '.join([t.text for t in soup.find_all('taxon')]) + ')'
24class Pfam(Database): 25 """The Pfam database is a large collection of protein families, 26 each represented by multiple sequence alignments and HMMs. 27 28 http://pfam.xfam.org 29 30 Pfam-A is the manually curated portion of the database that 31 contains over 16,000 entries. An automatically generated supplement 32 is provided called Pfam-B. Pfam-B was discontinued. 33 34 To install: 35 from seqsearch.databases.pfam import pfam 36 pfam.download() 37 pfam.ungzip() 38 39 It will put it in ~/databases/pfam 40 """ 41 42 all_paths = """ 43 /raw/ 44 /unzipped/Pfam-A.hmm 45 /unzipped/Pfam-A.fasta 46 /unzipped/Pfam-A.seed 47 /specific/ 48 """ 49 50 short_name = "pfam" 51 ftp_url = "ftp.ebi.ac.uk" 52 ftp_dir = "/pub/databases/Pfam/current_release/" 53 files = ("Pfam-A.hmm.gz", "Pfam-A.fasta.gz", "Pfam-A.seed.gz") 54 55 @property_cached 56 def hmm_db(self): 57 hmm_db = self.autopaths.hmm 58 hmm_db.seq_type = 'hmm_prot' 59 return hmm_db 60 61 @property_cached 62 def fasta(self): 63 fasta = FASTA(self.autopaths.fasta) 64 return fasta 65 66 @property_cached 67 def seeds(self): 68 seeds = FASTA(self.autopaths.seed) 69 return seeds
The Pfam database is a large collection of protein families, each represented by multiple sequence alignments and HMMs.
Pfam-A is the manually curated portion of the database that contains over 16,000 entries. An automatically generated supplement is provided called Pfam-B. Pfam-B was discontinued.
To install: from seqsearch.databases.pfam import pfam pfam.download() pfam.ungzip()
It will put it in ~/databases/pfam
class
SpecificFamily:
75class SpecificFamily(object): 76 """When you are interested in having an HMM 'database' with only 77 one specific Pfam in it. As well as the associated proteins that 78 are part of it.""" 79 80 all_paths = """ 81 /model.hmm 82 /proteins.fasta 83 /subsampled.fasta 84 """ 85 86 def __init__(self, fam_name): 87 self.fam_name = fam_name 88 self.base_dir = DirectoryPath(pfam.autopaths.specific_dir + self.fam_name) 89 self.p = AutoPaths(self.base_dir, self.all_paths) 90 91 @property_cached 92 def hmm_db(self): 93 """Create an HMM 'database' with only one profile in it.""" 94 hmm_db = self.p.model 95 hmm_db.seqtype = 'hmm_prot' 96 if not hmm_db.exists: 97 print(sh.hmmfetch('-o', hmm_db, pfam.hmm_db, self.fam_name)) 98 assert hmm_db 99 return hmm_db 100 101 @property_cached 102 def fasta(self): 103 """Make a fasta file with all uniprot proteins that are related to 104 this family.""" 105 fasta = FASTA(self.p.proteins) 106 if not fasta.exists: 107 fasta.create() 108 for seq in pfam.fasta: 109 if self.fam_name in seq.description: fasta.add_seq(seq) 110 fasta.close() 111 assert fasta 112 # Return # 113 return fasta 114 115 @property_cached 116 def subsampled(self): 117 subsampled = FASTA(self.p.subsampled) 118 if not subsampled.exists: 119 self.fasta.subsample(down_to=30, new_path=subsampled) 120 self.add_taxonomy(subsampled) 121 return subsampled 122 123 def add_taxonomy(self, fasta): 124 """Add taxonomic information to the fastas file""" 125 ids = [seq.description for seq in fasta] 126 accesions = [seq.description.split()[1] for seq in fasta] 127 taxonomies = map(self.uniprot_acc_to_taxonmy, accesions) 128 naming_dict = {ids[i]: ids[i] + taxonomies[i] for i in range(len(ids))} 129 fasta.rename_sequences(naming_dict, in_place=True) 130 131 def uniprot_acc_to_taxonmy(self, accesion): 132 """From one uniprot ID to taxonomy""" 133 from bioservices import UniProt 134 u = UniProt() 135 data = u.search(accesion, frmt="xml") 136 from bs4 import BeautifulSoup 137 soup = BeautifulSoup(data, "html.parser") 138 return ' (' + ', '.join([t.text for t in soup.find_all('taxon')]) + ')'
When you are interested in having an HMM 'database' with only one specific Pfam in it. As well as the associated proteins that are part of it.
def
add_taxonomy(self, fasta):
123 def add_taxonomy(self, fasta): 124 """Add taxonomic information to the fastas file""" 125 ids = [seq.description for seq in fasta] 126 accesions = [seq.description.split()[1] for seq in fasta] 127 taxonomies = map(self.uniprot_acc_to_taxonmy, accesions) 128 naming_dict = {ids[i]: ids[i] + taxonomies[i] for i in range(len(ids))} 129 fasta.rename_sequences(naming_dict, in_place=True)
Add taxonomic information to the fastas file
def
uniprot_acc_to_taxonmy(self, accesion):
131 def uniprot_acc_to_taxonmy(self, accesion): 132 """From one uniprot ID to taxonomy""" 133 from bioservices import UniProt 134 u = UniProt() 135 data = u.search(accesion, frmt="xml") 136 from bs4 import BeautifulSoup 137 soup = BeautifulSoup(data, "html.parser") 138 return ' (' + ', '.join([t.text for t in soup.find_all('taxon')]) + ')'
From one uniprot ID to taxonomy