seqsearch.databases.pfam

Written by Lucas Sinclair. MIT Licensed. Contact at www.sinclair.bio

  1#!/usr/bin/env python3
  2# -*- coding: utf-8 -*-
  3
  4"""
  5Written by Lucas Sinclair.
  6MIT Licensed.
  7Contact at www.sinclair.bio
  8"""
  9
 10# Built-in modules #
 11
 12# First party modules #
 13from seqsearch.databases import Database
 14from autopaths.auto_paths import AutoPaths
 15from autopaths.dir_path   import DirectoryPath
 16from plumbing.cache import property_cached
 17from fasta import FASTA
 18
 19# Module for launching shell commands #
 20from seqsearch import sh
 21
 22###############################################################################
 23class Pfam(Database):
 24    """The Pfam database is a large collection of protein families,
 25    each represented by multiple sequence alignments and HMMs.
 26
 27    http://pfam.xfam.org
 28
 29    Pfam-A is the manually curated portion of the database that
 30    contains over 16,000 entries. An automatically generated supplement
 31    is provided called Pfam-B. Pfam-B was discontinued.
 32
 33    To install:
 34        from seqsearch.databases.pfam import pfam
 35        pfam.download()
 36        pfam.ungzip()
 37
 38    It will put it in ~/databases/pfam
 39    """
 40
 41    all_paths = """
 42    /raw/
 43    /unzipped/Pfam-A.hmm
 44    /unzipped/Pfam-A.fasta
 45    /unzipped/Pfam-A.seed
 46    /specific/
 47    """
 48
 49    short_name = "pfam"
 50    ftp_url    = "ftp.ebi.ac.uk"
 51    ftp_dir    = "/pub/databases/Pfam/current_release/"
 52    files      = ("Pfam-A.hmm.gz", "Pfam-A.fasta.gz", "Pfam-A.seed.gz")
 53
 54    @property_cached
 55    def hmm_db(self):
 56        hmm_db = self.autopaths.hmm
 57        hmm_db.seq_type = 'hmm_prot'
 58        return hmm_db
 59
 60    @property_cached
 61    def fasta(self):
 62        fasta = FASTA(self.autopaths.fasta)
 63        return fasta
 64
 65    @property_cached
 66    def seeds(self):
 67        seeds = FASTA(self.autopaths.seed)
 68        return seeds
 69
 70###############################################################################
 71pfam = Pfam("hmm_prot")
 72
 73###############################################################################
 74class SpecificFamily(object):
 75    """When you are interested in having an HMM 'database' with only
 76    one specific Pfam in it. As well as the associated proteins that
 77    are part of it."""
 78
 79    all_paths = """
 80    /model.hmm
 81    /proteins.fasta
 82    /subsampled.fasta
 83    """
 84
 85    def __init__(self, fam_name):
 86        self.fam_name = fam_name
 87        self.base_dir = DirectoryPath(pfam.autopaths.specific_dir + self.fam_name)
 88        self.p        = AutoPaths(self.base_dir, self.all_paths)
 89
 90    @property_cached
 91    def hmm_db(self):
 92        """Create an HMM 'database' with only one profile in it."""
 93        hmm_db = self.p.model
 94        hmm_db.seqtype = 'hmm_prot'
 95        if not hmm_db.exists:
 96            print(sh.hmmfetch('-o', hmm_db, pfam.hmm_db, self.fam_name))
 97            assert hmm_db
 98        return hmm_db
 99
100    @property_cached
101    def fasta(self):
102        """Make a fasta file with all uniprot proteins that are related to
103        this family."""
104        fasta = FASTA(self.p.proteins)
105        if not fasta.exists:
106            fasta.create()
107            for seq in pfam.fasta:
108                if self.fam_name in seq.description: fasta.add_seq(seq)
109            fasta.close()
110            assert fasta
111        # Return #
112        return fasta
113
114    @property_cached
115    def subsampled(self):
116        subsampled = FASTA(self.p.subsampled)
117        if not subsampled.exists:
118            self.fasta.subsample(down_to=30, new_path=subsampled)
119            self.add_taxonomy(subsampled)
120        return subsampled
121
122    def add_taxonomy(self, fasta):
123        """Add taxonomic information to the fastas file"""
124        ids        = [seq.description for seq in fasta]
125        accesions  = [seq.description.split()[1] for seq in fasta]
126        taxonomies = map(self.uniprot_acc_to_taxonmy, accesions)
127        naming_dict  = {ids[i]: ids[i] + taxonomies[i] for i in range(len(ids))}
128        fasta.rename_sequences(naming_dict, in_place=True)
129
130    def uniprot_acc_to_taxonmy(self, accesion):
131        """From one uniprot ID to taxonomy"""
132        from bioservices import UniProt
133        u = UniProt()
134        data = u.search(accesion, frmt="xml")
135        from bs4 import BeautifulSoup
136        soup = BeautifulSoup(data, "html.parser")
137        return ' (' + ', '.join([t.text for t in soup.find_all('taxon')]) + ')'
class Pfam(seqsearch.databases.Database):
24class Pfam(Database):
25    """The Pfam database is a large collection of protein families,
26    each represented by multiple sequence alignments and HMMs.
27
28    http://pfam.xfam.org
29
30    Pfam-A is the manually curated portion of the database that
31    contains over 16,000 entries. An automatically generated supplement
32    is provided called Pfam-B. Pfam-B was discontinued.
33
34    To install:
35        from seqsearch.databases.pfam import pfam
36        pfam.download()
37        pfam.ungzip()
38
39    It will put it in ~/databases/pfam
40    """
41
42    all_paths = """
43    /raw/
44    /unzipped/Pfam-A.hmm
45    /unzipped/Pfam-A.fasta
46    /unzipped/Pfam-A.seed
47    /specific/
48    """
49
50    short_name = "pfam"
51    ftp_url    = "ftp.ebi.ac.uk"
52    ftp_dir    = "/pub/databases/Pfam/current_release/"
53    files      = ("Pfam-A.hmm.gz", "Pfam-A.fasta.gz", "Pfam-A.seed.gz")
54
55    @property_cached
56    def hmm_db(self):
57        hmm_db = self.autopaths.hmm
58        hmm_db.seq_type = 'hmm_prot'
59        return hmm_db
60
61    @property_cached
62    def fasta(self):
63        fasta = FASTA(self.autopaths.fasta)
64        return fasta
65
66    @property_cached
67    def seeds(self):
68        seeds = FASTA(self.autopaths.seed)
69        return seeds

The Pfam database is a large collection of protein families, each represented by multiple sequence alignments and HMMs.

http://pfam.xfam.org

Pfam-A is the manually curated portion of the database that contains over 16,000 entries. An automatically generated supplement is provided called Pfam-B. Pfam-B was discontinued.

To install: from seqsearch.databases.pfam import pfam pfam.download() pfam.ungzip()

It will put it in ~/databases/pfam

class SpecificFamily:
 75class SpecificFamily(object):
 76    """When you are interested in having an HMM 'database' with only
 77    one specific Pfam in it. As well as the associated proteins that
 78    are part of it."""
 79
 80    all_paths = """
 81    /model.hmm
 82    /proteins.fasta
 83    /subsampled.fasta
 84    """
 85
 86    def __init__(self, fam_name):
 87        self.fam_name = fam_name
 88        self.base_dir = DirectoryPath(pfam.autopaths.specific_dir + self.fam_name)
 89        self.p        = AutoPaths(self.base_dir, self.all_paths)
 90
 91    @property_cached
 92    def hmm_db(self):
 93        """Create an HMM 'database' with only one profile in it."""
 94        hmm_db = self.p.model
 95        hmm_db.seqtype = 'hmm_prot'
 96        if not hmm_db.exists:
 97            print(sh.hmmfetch('-o', hmm_db, pfam.hmm_db, self.fam_name))
 98            assert hmm_db
 99        return hmm_db
100
101    @property_cached
102    def fasta(self):
103        """Make a fasta file with all uniprot proteins that are related to
104        this family."""
105        fasta = FASTA(self.p.proteins)
106        if not fasta.exists:
107            fasta.create()
108            for seq in pfam.fasta:
109                if self.fam_name in seq.description: fasta.add_seq(seq)
110            fasta.close()
111            assert fasta
112        # Return #
113        return fasta
114
115    @property_cached
116    def subsampled(self):
117        subsampled = FASTA(self.p.subsampled)
118        if not subsampled.exists:
119            self.fasta.subsample(down_to=30, new_path=subsampled)
120            self.add_taxonomy(subsampled)
121        return subsampled
122
123    def add_taxonomy(self, fasta):
124        """Add taxonomic information to the fastas file"""
125        ids        = [seq.description for seq in fasta]
126        accesions  = [seq.description.split()[1] for seq in fasta]
127        taxonomies = map(self.uniprot_acc_to_taxonmy, accesions)
128        naming_dict  = {ids[i]: ids[i] + taxonomies[i] for i in range(len(ids))}
129        fasta.rename_sequences(naming_dict, in_place=True)
130
131    def uniprot_acc_to_taxonmy(self, accesion):
132        """From one uniprot ID to taxonomy"""
133        from bioservices import UniProt
134        u = UniProt()
135        data = u.search(accesion, frmt="xml")
136        from bs4 import BeautifulSoup
137        soup = BeautifulSoup(data, "html.parser")
138        return ' (' + ', '.join([t.text for t in soup.find_all('taxon')]) + ')'

When you are interested in having an HMM 'database' with only one specific Pfam in it. As well as the associated proteins that are part of it.

SpecificFamily(fam_name)
86    def __init__(self, fam_name):
87        self.fam_name = fam_name
88        self.base_dir = DirectoryPath(pfam.autopaths.specific_dir + self.fam_name)
89        self.p        = AutoPaths(self.base_dir, self.all_paths)
hmm_db

Create an HMM 'database' with only one profile in it.

fasta

Make a fasta file with all uniprot proteins that are related to this family.

def add_taxonomy(self, fasta):
123    def add_taxonomy(self, fasta):
124        """Add taxonomic information to the fastas file"""
125        ids        = [seq.description for seq in fasta]
126        accesions  = [seq.description.split()[1] for seq in fasta]
127        taxonomies = map(self.uniprot_acc_to_taxonmy, accesions)
128        naming_dict  = {ids[i]: ids[i] + taxonomies[i] for i in range(len(ids))}
129        fasta.rename_sequences(naming_dict, in_place=True)

Add taxonomic information to the fastas file

def uniprot_acc_to_taxonmy(self, accesion):
131    def uniprot_acc_to_taxonmy(self, accesion):
132        """From one uniprot ID to taxonomy"""
133        from bioservices import UniProt
134        u = UniProt()
135        data = u.search(accesion, frmt="xml")
136        from bs4 import BeautifulSoup
137        soup = BeautifulSoup(data, "html.parser")
138        return ' (' + ', '.join([t.text for t in soup.find_all('taxon')]) + ')'

From one uniprot ID to taxonomy