fasta.splitable
Written by Lucas Sinclair. MIT Licensed. Contact at www.sinclair.bio
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Written by Lucas Sinclair.
MIT Licensed.
Contact at www.sinclair.bio
"""

# Built-in modules #
import os, sys, math, shutil

# Internal modules #
from fasta import FASTA

# Third party modules #

###############################################################################
class SplitableFASTA(FASTA):
    """
    A FASTA file which you can split into chunks. Either you give the number
    of parts you want to generate, or you can give a target size for each
    part.

    The parts are FASTA objects located at `<base_dir>001/part.fasta`,
    `<base_dir>002/part.fasta`, and so on.
    """

    def __init__(self, path, num_parts=None, part_size=None, base_dir=None):
        """
        :param path: Path of the FASTA file to split.
        :param num_parts: Number of parts to generate.
        :param part_size: Target size of each part, passed through
                          `humanfriendly.parse_size`. When given, the number
                          of parts is derived from it and replaces
                          `num_parts`.
        :param base_dir: Prefix under which the numbered part directories
                         are placed. It is concatenated as-is with the part
                         number, so it should end with a path separator.
                         Defaults to `path + '.parts/'`.
        """
        # Basic #
        self.path = path
        # Directory #
        if base_dir is None: self.base_dir = path + '.parts/'
        else:                self.base_dir = base_dir
        # Num parts #
        # NOTE(review): if neither `num_parts` nor `part_size` is given,
        # `self.num_parts` is never assigned in this method.
        if num_parts is not None: self.num_parts = num_parts
        # Special module #
        import humanfriendly
        # Evaluate size #
        if part_size is not None:
            self.bytes_target = humanfriendly.parse_size(part_size)
            self.num_parts = int(math.ceil(self.count_bytes / self.bytes_target))
        # Make parts (directories are numbered starting at 001) #
        self.make_name = lambda i: self.base_dir + "%03d/part.fasta" % i
        self.parts = [FASTA(self.make_name(i))
                      for i in range(1, self.num_parts+1)]
        # Give a number to each part (zero-based, unlike the directory names) #
        for i, part in enumerate(self.parts): part.num = i

    @property
    def status(self):
        """Has the splitting been done already?"""
        return all(os.path.exists(p.path) for p in self.parts)

    def run(self):
        """
        Split the sequences of this file across `self.parts`.

        Every part receives `count // num_parts` sequences, in file order,
        and the remaining sequences are appended to the last part. When there
        is only one part, no sequences are copied: `self.link_to` is called
        on that part instead.
        """
        # Clean up: remove consecutively numbered directories of a previous run #
        for i in range(1, sys.maxsize):
            dir_path = self.base_dir + "%03d/" % i
            if os.path.exists(dir_path): shutil.rmtree(dir_path)
            else: break
        # Case only one part #
        if len(self.parts) == 1:
            self.parts[0].directory.create(safe=True)
            self.link_to(self.parts[0])
            return
        # Compute number of sequences #
        self.seqs_per_part = self.count // self.num_parts
        # Prepare #
        for part in self.parts: part.create()
        # Do the job #
        seqs = self.parse()
        for part in self.parts:
            for _ in range(self.seqs_per_part):
                # The built-in next() is required, iterators have no
                # `.next()` method in python 3.
                part.add_seq(next(seqs))
        # The final sequences go to the last part #
        last_part = self.parts[-1]
        for seq in seqs: last_part.add_seq(seq)
        # Clean up #
        for part in self.parts: part.close()
class SplitableFASTA(FASTA):
    """
    A FASTA file which you can split into chunks. Either you give the number
    of parts you want to generate, or you can give a target size for each
    part.

    The parts are FASTA objects located at `<base_dir>001/part.fasta`,
    `<base_dir>002/part.fasta`, and so on.
    """

    def __init__(self, path, num_parts=None, part_size=None, base_dir=None):
        """
        :param path: Path of the FASTA file to split.
        :param num_parts: Number of parts to generate.
        :param part_size: Target size of each part, passed through
                          `humanfriendly.parse_size`. When given, the number
                          of parts is derived from it and replaces
                          `num_parts`.
        :param base_dir: Prefix under which the numbered part directories
                         are placed. It is concatenated as-is with the part
                         number, so it should end with a path separator.
                         Defaults to `path + '.parts/'`.
        """
        # Basic #
        self.path = path
        # Directory #
        if base_dir is None: self.base_dir = path + '.parts/'
        else:                self.base_dir = base_dir
        # Num parts #
        # NOTE(review): if neither `num_parts` nor `part_size` is given,
        # `self.num_parts` is never assigned in this method.
        if num_parts is not None: self.num_parts = num_parts
        # Special module #
        import humanfriendly
        # Evaluate size #
        if part_size is not None:
            self.bytes_target = humanfriendly.parse_size(part_size)
            self.num_parts = int(math.ceil(self.count_bytes / self.bytes_target))
        # Make parts (directories are numbered starting at 001) #
        self.make_name = lambda i: self.base_dir + "%03d/part.fasta" % i
        self.parts = [FASTA(self.make_name(i))
                      for i in range(1, self.num_parts+1)]
        # Give a number to each part (zero-based, unlike the directory names) #
        for i, part in enumerate(self.parts): part.num = i

    @property
    def status(self):
        """Has the splitting been done already?"""
        return all(os.path.exists(p.path) for p in self.parts)

    def run(self):
        """
        Split the sequences of this file across `self.parts`.

        Every part receives `count // num_parts` sequences, in file order,
        and the remaining sequences are appended to the last part. When there
        is only one part, no sequences are copied: `self.link_to` is called
        on that part instead.
        """
        # Clean up: remove consecutively numbered directories of a previous run #
        for i in range(1, sys.maxsize):
            dir_path = self.base_dir + "%03d/" % i
            if os.path.exists(dir_path): shutil.rmtree(dir_path)
            else: break
        # Case only one part #
        if len(self.parts) == 1:
            self.parts[0].directory.create(safe=True)
            self.link_to(self.parts[0])
            return
        # Compute number of sequences #
        self.seqs_per_part = self.count // self.num_parts
        # Prepare #
        for part in self.parts: part.create()
        # Do the job #
        seqs = self.parse()
        for part in self.parts:
            for _ in range(self.seqs_per_part):
                # The built-in next() is required, iterators have no
                # `.next()` method in python 3.
                part.add_seq(next(seqs))
        # The final sequences go to the last part #
        last_part = self.parts[-1]
        for seq in seqs: last_part.add_seq(seq)
        # Clean up #
        for part in self.parts: part.close()
A FASTA file which you can split into chunks. Specify either the number of parts to generate (`num_parts`) or a target size for each part (`part_size`, converted to a number of bytes with `humanfriendly.parse_size`).
SplitableFASTA(path, *args, **kwargs)
def __new__(cls, path, *args, **kwargs):
    """
    A Path object is in fact a string: the instance is built by `str.__new__`
    from the value returned by `cls.clean_path(path)`. The extra positional
    and keyword arguments are accepted but not used here.
    """
    cleaned = cls.clean_path(path)
    return str.__new__(cls, cleaned)
A Path object is in fact a string.
def
run(self):
def run(self):
    """
    Split the sequences of this file across `self.parts`.

    Every part receives `count // num_parts` sequences, in file order,
    and the remaining sequences are appended to the last part. When there
    is only one part, no sequences are copied: `self.link_to` is called
    on that part instead.
    """
    # Clean up: remove consecutively numbered directories of a previous run #
    for i in range(1, sys.maxsize):
        dir_path = self.base_dir + "%03d/" % i
        if os.path.exists(dir_path): shutil.rmtree(dir_path)
        else: break
    # Case only one part #
    if len(self.parts) == 1:
        self.parts[0].directory.create(safe=True)
        self.link_to(self.parts[0])
        return
    # Compute number of sequences #
    self.seqs_per_part = self.count // self.num_parts
    # Prepare #
    for part in self.parts: part.create()
    # Do the job #
    seqs = self.parse()
    for part in self.parts:
        for _ in range(self.seqs_per_part):
            # The built-in next() is required, iterators have no
            # `.next()` method in python 3.
            part.add_seq(next(seqs))
    # The final sequences go to the last part #
    last_part = self.parts[-1]
    for seq in seqs: last_part.add_seq(seq)
    # Clean up #
    for part in self.parts: part.close()
Inherited Members
- fasta.core.FASTA
- format
- ext
- buffer_size
- gzipped
- first
- count
- lengths
- lengths_counter
- open
- close
- parse
- progress
- create
- add
- add_seq
- add_str
- add_fasta
- add_fastas
- flush
- write
- compress
- compress_slow
- compress_fast
- ids
- get_id
- sequences
- sql
- length_by_id
- subsample
- rename_with_num
- rename_with_prefix
- rename_sequences
- extract_length
- extract_sequences
- remove_trailing_stars
- remove_duplicates
- convert_U_to_T
- align
- template_align
- index_bowtie
- index_samtools
- graphs
- parse_primers
- autopaths.file_path.FilePath
- prefix_path
- prefix
- name
- filename
- directory
- count_bytes
- size
- contents
- contents_utf8
- md5
- might_be_binary
- contains_binary
- magic_number
- lines
- read
- touch
- writelines
- remove
- copy
- execute
- replace_extension
- new_name_insert
- make_directory
- must_exist
- head
- pretty_head
- tail
- pretty_tail
- move_to
- rename
- gzip_to
- gzip_internal
- gzip_external
- gzip_pigz
- ungzip_to
- ungzip_internal
- ungzip_external
- zip_to
- unzip_to
- untar_to
- untargz_to
- untargz_to_internal
- untargz_to_external
- append
- prepend
- remove_line
- remove_first_line
- replace_line
- replace_word
- sed_replace
- autopaths.base_path.BasePath
- clean_path
- short_prefix
- extension
- escaped
- absolute_path
- physical_path
- relative_path
- rel_path_from
- exists
- is_symlink
- permissions
- mdate
- mdate_iso
- cdate
- cdate_iso
- unix_style
- wsl_style
- win_style
- with_tilda
- with_home
- link_from
- link_to
- symlinks_on_linux
- symlinks_on_windows
- hard_link_win_to
- builtins.str
- encode
- replace
- split
- rsplit
- join
- capitalize
- casefold
- title
- center
- expandtabs
- find
- partition
- index
- ljust
- lower
- lstrip
- rfind
- rindex
- rjust
- rstrip
- rpartition
- splitlines
- strip
- swapcase
- translate
- upper
- startswith
- endswith
- removeprefix
- removesuffix
- isascii
- islower
- isupper
- istitle
- isspace
- isdecimal
- isdigit
- isnumeric
- isalpha
- isalnum
- isidentifier
- isprintable
- zfill
- format_map
- maketrans