fasta.splitable

Written by Lucas Sinclair. MIT Licensed. Contact at www.sinclair.bio

 1#!/usr/bin/env python3
 2# -*- coding: utf-8 -*-
 3
 4"""
 5Written by Lucas Sinclair.
 6MIT Licensed.
 7Contact at www.sinclair.bio
 8"""
 9
10# Built-in modules #
11import os, sys, math, shutil
12
13# Internal modules #
14from fasta import FASTA
15
16# Third party modules #
17
18###############################################################################
class SplitableFASTA(FASTA):
    """
    A FASTA file which you can split into chunks. Either you give the number
    of parts you want to generate, or you can give a target size for each
    part (any value accepted by `humanfriendly.parse_size`).

    Every part is a `FASTA` object located at "<base_dir>NNN/part.fasta",
    where NNN is the one-based, zero-padded index of the part.
    """

    def __init__(self, path, num_parts=None, part_size=None, base_dir=None):
        """
        :param path:      Path of the FASTA file that will be split.
        :param num_parts: Number of parts to generate.
        :param part_size: Target size of each part; it is passed to
                          `humanfriendly.parse_size` and, when given, the
                          resulting number of parts overrides `num_parts`.
        :param base_dir:  Prefix under which the numbered part directories
                          are placed. It is concatenated as-is, so it has to
                          end with a path separator. Defaults to `path`
                          followed by '.parts/'.

        NOTE(review): if both `num_parts` and `part_size` are None, then
        `self.num_parts` has to exist already (e.g. as a class attribute),
        otherwise building the parts raises an AttributeError.
        """
        # Basic #
        self.path = path
        # Directory #
        if base_dir is None: self.base_dir = path + '.parts/'
        else:                self.base_dir = base_dir
        # Num parts #
        if num_parts is not None: self.num_parts = num_parts
        # Evaluate size #
        if part_size is not None:
            # Special module, only required when a size is given #
            import humanfriendly
            self.bytes_target = humanfriendly.parse_size(part_size)
            self.num_parts = int(math.ceil(self.count_bytes / self.bytes_target))
        # Make parts #
        self.make_name = lambda i: self.base_dir + "%03d/part.fasta" % i
        self.parts = [FASTA(self.make_name(i))
                      for i in range(1, self.num_parts+1)]
        # Give a number to each part (zero-based, unlike the directories) #
        for i, part in enumerate(self.parts): part.num = i

    @property
    def status(self):
        """Has the splitting been done already?"""
        return all(os.path.exists(p.path) for p in self.parts)

    def run(self):
        """
        Split the original file into the parts.

        Numbered directories left over from a previous run are removed first.
        When there is a single part, it is only linked to the original file.
        Otherwise every part receives `floor(count / num_parts)` sequences
        and the sequences that remain are appended to the last part.
        """
        # Clean up: stops at the first numbered directory that is missing #
        for i in range(1, sys.maxsize):
            dir_path = self.base_dir + "%03d/" % i
            if os.path.exists(dir_path): shutil.rmtree(dir_path)
            else: break
        # Case only one part #
        if len(self.parts) == 1:
            self.parts[0].directory.create(safe=True)
            self.link_to(self.parts[0])
            return
        # Compute number of sequences #
        self.seqs_per_part = int(math.floor(self.count / self.num_parts))
        # Prepare #
        for part in self.parts: part.create()
        # Do the job: use the builtin next(), iterators have no .next() #
        seqs = self.parse()
        for part in self.parts:
            for _ in range(self.seqs_per_part):
                part.add_seq(next(seqs))
        # The final sequences go to the last part #
        for seq in seqs: self.parts[-1].add_seq(seq)
        # Clean up #
        for part in self.parts: part.close()
class SplitableFASTA(fasta.core.FASTA):
class SplitableFASTA(FASTA):
    """
    A FASTA file which you can split into chunks. Either you give the number
    of parts you want to generate, or you can give a target size for each
    part (any value accepted by `humanfriendly.parse_size`).

    Every part is a `FASTA` object located at "<base_dir>NNN/part.fasta",
    where NNN is the one-based, zero-padded index of the part.
    """

    def __init__(self, path, num_parts=None, part_size=None, base_dir=None):
        """
        :param path:      Path of the FASTA file that will be split.
        :param num_parts: Number of parts to generate.
        :param part_size: Target size of each part; it is passed to
                          `humanfriendly.parse_size` and, when given, the
                          resulting number of parts overrides `num_parts`.
        :param base_dir:  Prefix under which the numbered part directories
                          are placed. It is concatenated as-is, so it has to
                          end with a path separator. Defaults to `path`
                          followed by '.parts/'.

        NOTE(review): if both `num_parts` and `part_size` are None, then
        `self.num_parts` has to exist already (e.g. as a class attribute),
        otherwise building the parts raises an AttributeError.
        """
        # Basic #
        self.path = path
        # Directory #
        if base_dir is None: self.base_dir = path + '.parts/'
        else:                self.base_dir = base_dir
        # Num parts #
        if num_parts is not None: self.num_parts = num_parts
        # Evaluate size #
        if part_size is not None:
            # Special module, only required when a size is given #
            import humanfriendly
            self.bytes_target = humanfriendly.parse_size(part_size)
            self.num_parts = int(math.ceil(self.count_bytes / self.bytes_target))
        # Make parts #
        self.make_name = lambda i: self.base_dir + "%03d/part.fasta" % i
        self.parts = [FASTA(self.make_name(i))
                      for i in range(1, self.num_parts+1)]
        # Give a number to each part (zero-based, unlike the directories) #
        for i, part in enumerate(self.parts): part.num = i

    @property
    def status(self):
        """Has the splitting been done already?"""
        return all(os.path.exists(p.path) for p in self.parts)

    def run(self):
        """
        Split the original file into the parts.

        Numbered directories left over from a previous run are removed first.
        When there is a single part, it is only linked to the original file.
        Otherwise every part receives `floor(count / num_parts)` sequences
        and the sequences that remain are appended to the last part.
        """
        # Clean up: stops at the first numbered directory that is missing #
        for i in range(1, sys.maxsize):
            dir_path = self.base_dir + "%03d/" % i
            if os.path.exists(dir_path): shutil.rmtree(dir_path)
            else: break
        # Case only one part #
        if len(self.parts) == 1:
            self.parts[0].directory.create(safe=True)
            self.link_to(self.parts[0])
            return
        # Compute number of sequences #
        self.seqs_per_part = int(math.floor(self.count / self.num_parts))
        # Prepare #
        for part in self.parts: part.create()
        # Do the job: use the builtin next(), iterators have no .next() #
        seqs = self.parse()
        for part in self.parts:
            for _ in range(self.seqs_per_part):
                part.add_seq(next(seqs))
        # The final sequences go to the last part #
        for seq in seqs: self.parts[-1].add_seq(seq)
        # Clean up #
        for part in self.parts: part.close()

A FASTA file which you can split into chunks. Either give the number of parts you want to generate, or give a target size for each part; the size is parsed with humanfriendly.parse_size and converted to a number of bytes.

SplitableFASTA(path, *args, **kwargs)
59    def __new__(cls, path, *args, **kwargs):
60        """A Path object is in fact a string."""
61        return str.__new__(cls, cls.clean_path(path))

A Path object is in fact a string.

path
make_name
parts
status

Has the splitting been done already?

def run(self):
54    def run(self):
55        # Clean up #
56        for i in range(1, sys.maxsize):
57            dir_path = self.base_dir + "%03d/" % i
58            if os.path.exists(dir_path): shutil.rmtree(dir_path)
59            else: break
60        # Case only one part #
61        if len(self.parts) == 1:
62            self.parts[0].directory.create(safe=True)
63            self.link_to(self.parts[0])
64            return
65        # Compute number of sequences #
66        self.seqs_per_part = int(math.floor(self.count / self.num_parts))
67        # Prepare #
68        for part in self.parts: part.create()
69        # Do the job #
70        seqs = self.parse()
71        for part in self.parts:
72            for i in range(self.seqs_per_part):
73                part.add_seq(seqs.next())
74        # The final sequences go to the last part #
75        for seq in seqs: part.add_seq(seq)
76        # Clean up #
77        for part in self.parts: part.close()
Inherited Members
fasta.core.FASTA
format
ext
buffer_size
gzipped
first
count
lengths
lengths_counter
open
close
parse
progress
create
add
add_seq
add_str
add_fasta
add_fastas
flush
write
compress
compress_slow
compress_fast
ids
get_id
sequences
sql
length_by_id
subsample
rename_with_num
rename_with_prefix
rename_sequences
extract_length
extract_sequences
remove_trailing_stars
remove_duplicates
convert_U_to_T
align
template_align
index_bowtie
index_samtools
graphs
parse_primers
autopaths.file_path.FilePath
prefix_path
prefix
name
filename
directory
count_bytes
size
contents
contents_utf8
md5
might_be_binary
contains_binary
magic_number
lines
read
touch
writelines
remove
copy
execute
replace_extension
new_name_insert
make_directory
must_exist
head
pretty_head
tail
pretty_tail
move_to
rename
gzip_to
gzip_internal
gzip_external
gzip_pigz
ungzip_to
ungzip_internal
ungzip_external
zip_to
unzip_to
untar_to
untargz_to
untargz_to_internal
untargz_to_external
append
prepend
remove_line
remove_first_line
replace_line
replace_word
sed_replace
autopaths.base_path.BasePath
clean_path
short_prefix
extension
escaped
absolute_path
physical_path
relative_path
rel_path_from
exists
permissions
mdate
mdate_iso
cdate
cdate_iso
unix_style
wsl_style
win_style
with_tilda
with_home
builtins.str
encode
replace
split
rsplit
join
capitalize
casefold
title
center
expandtabs
find
partition
index
ljust
lower
lstrip
rfind
rindex
rjust
rstrip
rpartition
splitlines
strip
swapcase
translate
upper
startswith
endswith
removeprefix
removesuffix
isascii
islower
isupper
istitle
isspace
isdecimal
isdigit
isnumeric
isalpha
isalnum
isidentifier
isprintable
zfill
format_map
maketrans