1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
|
from pathlib import Path
from typing import List, Tuple, Generator, Mapping
def join_sequences(sequences: List[str]) -> str:
"""
Joins a list of sequences into a single string by removing leading and trailing whitespace.
Args:
sequences (List[str]): A list of sequences as strings.
Returns:
str: The concatenated string of sequences with leading and trailing whitespace removed.
"""
return "".join(map(lambda x: x.strip(), sequences))
def partition(fasta: List[str]) -> Generator[Tuple[str, str], None, None]:
"""
Partitions a FASTA file into sequence ID and corresponding sequence pairs.
Args:
fasta (List[str]): A list of lines from a FASTA file.
Yields:
Tuple[str, str]: A tuple containing sequence ID and its corresponding sequence.
Examples:
>>> fasta = [
... ">seq1",
... "ATCG",
... "ATC",
... ">seq2",
... "GTCA",
... "GTC",
... ]
>>> for sequence_id, sequence in partition(fasta):
... print(sequence_id, sequence)
seq1 ATCGATC
seq2 GTCAGTC
"""
index, sequence_id = 0, ""
for i, line in enumerate(fasta):
if line.startswith(">"):
if i == 0:
sequence_id = line.strip().lstrip(">")
continue
else:
yield (sequence_id, join_sequences(fasta[index + 1:i]))
index, sequence_id = i, line.strip().lstrip(">")
elif (i + 1) == len(fasta):
yield (sequence_id, join_sequences(fasta[index + 1:]))
else:
continue
def read_fasta(path: Path) -> Mapping[str, str]:
"""
Reads a FASTA file and returns a dictionary of sequence IDs and their corresponding sequences.
Args:
path (Path): The path to the FASTA file.
Returns:
Mapping[str, str]: A dictionary mapping sequence IDs to their sequences.
Examples:
>>> path = Path("sequences.fasta")
>>> sequences = read_fasta(path)
>>> print(sequences)
{"seq1": "ATCGATC", "seq2": "GTCAGTC"}
"""
with path.open(mode="r") as f:
sequences = partition(f.readlines())
return dict(sequences)
|