Rosalind: Consensus and Profile

Python

Read FASTA
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
from pathlib import Path
from collections import Counter
from typing import List, Tuple, Generator, Mapping


def join_sequences(sequences: List[str]) -> str:
    """
    Joins a list of sequences into a single string by removing leading and trailing whitespace.

    Args:
        sequences (List[str]): A list of sequences as strings.

    Returns:
        str: The concatenated string of sequences with leading and trailing whitespace removed.
    """
    return "".join(map(lambda x: x.strip(), sequences))


def partition(fasta: List[str]) -> Generator[Tuple[str, str], None, None]:
    """
    Partitions a FASTA file into sequence ID and corresponding sequence pairs.

    Args:
        fasta (List[str]): A list of lines from a FASTA file.

    Yields:
        Tuple[str, str]: A tuple containing sequence ID and its corresponding sequence.

    Examples:
        >>> fasta = [
        ...     ">seq1",
        ...     "ATCG",
        ...     "ATC",
        ...     ">seq2",
        ...     "GTCA",
        ...     "GTC",
        ... ]
        >>> for sequence_id, sequence in partition(fasta):
        ...     print(sequence_id, sequence)
        seq1 ATCGATC
        seq2 GTCAGTC
    """
    index, sequence_id = 0, ""
    for i, line in enumerate(fasta):
        if line.startswith(">"):
            if i == 0:
                sequence_id = line.strip().lstrip(">")
                continue
            else:
                yield (sequence_id, join_sequences(fasta[index + 1:i]))
                index, sequence_id = i, line.strip().lstrip(">")
        elif (i + 1) == len(fasta):
            yield (sequence_id, join_sequences(fasta[index + 1:]))
        else:
            continue
    

def read_fasta(path: Path) -> Mapping[str, str]:
    """
    Reads a FASTA file and returns a dictionary of sequence IDs and their corresponding sequences.

    Args:
        path (Path): The path to the FASTA file.

    Returns:
        Mapping[str, str]: A dictionary mapping sequence IDs to their sequences.

    Examples:
        >>> path = Path("sequences.fasta")
        >>> sequences = read_fasta(path)
        >>> print(sequences)
        {"seq1": "ATCGATC", "seq2": "GTCAGTC"}
    """
    with path.open(mode="r") as f:
        sequences = partition(f.readlines())
    return dict(sequences)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
def transpose(sequences: List[str]) -> Generator[str, None, None]:
    """
    Transposes a list of sequences.

    Args:
        sequences (List[str]): A list of sequences.

    Yields:
        str: Transposed sequences.

    Examples:
        >>> sequences = ["ATCG", "GTCA"]
        >>> for transposed in transpose(sequences):
        ...     print(transposed)
        AG
        TC
        CG
        A
    """
    for _transpose in zip(*sequences):
        yield _transpose


def nucleotide_counter(sequences: List[str]) -> Generator[Counter, None, None]:
    """
    Counts the occurrences of nucleotides in a list of sequences.

    Args:
        sequences (List[str]): A list of sequences.

    Yields:
        Counter: A counter object containing the count of nucleotides.

    Examples:
        >>> sequences = ["ATCG", "GTCA"]
        >>> counters = nucleotide_counter(sequences)
        >>> for counter in counters:
        ...     print(counter)
        Counter({"A": 1, "G": 1})
        Counter({"T": 1, "C": 1})
        Counter({"C": 1, "G": 1})
        Counter({"A": 1})
    """
    return map(Counter, transpose(sequences))


def consensus(sequences: List[str]) -> str:
    """
    Finds the consensus sequence and count matrix from a list of sequences.

    Args:
        sequences (List[str]): A list of sequences.

    Returns:
        str: The consensus sequence.
        List[Tuple[int, int, int, int]]: The count matrix containing the occurrences of nucleotides.

    Examples:
        >>> sequences = ["ATCG", "GTCA"]
        >>> consensus_seq, count_matrix = consensus(sequences)
        >>> print(consensus_seq)
        ATCG
        >>> print(count_matrix)
        [(1, 0, 0, 0), (0, 0, 1, 1), (0, 1, 0, 0), (0, 0, 1, 1)]
    """
    def _consensus(counters) -> Generator[str, None, None]:
        for count in counters:
            yield count.most_common(1)[0][0]

    def _count_matrix(counters: List[Counter]) -> Generator[Tuple[int, int, int, int], None, None]:
        for counter in counters:
            a = counter.get("A", 0)
            c = counter.get("C", 0)
            g = counter.get("G", 0)
            t = counter.get("T", 0)
            yield (a, c, g, t)

    counters = tuple(nucleotide_counter(sequences))
    consensus_sequence = "".join(_consensus(counters))
    count_matrix = list(transpose(_count_matrix(counters)))
    return consensus_sequence, count_matrix
0%