Rosalind: Computing GC Content

Python

Reading FASTA Files

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
from pathlib import Path
from typing import List, Tuple, Generator, Mapping


def join_sequences(sequences: List[str]) -> str:
    """
    Joins a list of sequences into a single string by removing leading and trailing whitespace.

    Args:
        sequences (List[str]): A list of sequences as strings.

    Returns:
        str: The concatenated string of sequences with leading and trailing whitespace removed.
    """
    return "".join(map(lambda x: x.strip(), sequences))


def partition(fasta: List[str]) -> Generator[Tuple[str, str], None, None]:
    """
    Partitions a FASTA file into sequence ID and corresponding sequence pairs.

    Args:
        fasta (List[str]): A list of lines from a FASTA file.

    Yields:
        Tuple[str, str]: A tuple containing sequence ID and its corresponding sequence.

    Examples:
        >>> fasta = [
        ...     ">seq1",
        ...     "ATCG",
        ...     "ATC",
        ...     ">seq2",
        ...     "GTCA",
        ...     "GTC",
        ... ]
        >>> for sequence_id, sequence in partition(fasta):
        ...     print(sequence_id, sequence)
        seq1 ATCGATC
        seq2 GTCAGTC
    """
    index, sequence_id = 0, ""
    for i, line in enumerate(fasta):
        if line.startswith(">"):
            if i == 0:
                sequence_id = line.strip().lstrip(">")
                continue
            else:
                yield (sequence_id, join_sequences(fasta[index + 1:i]))
                index, sequence_id = i, line.strip().lstrip(">")
        elif (i + 1) == len(fasta):
            yield (sequence_id, join_sequences(fasta[index + 1:]))
        else:
            continue
    

def read_fasta(path: Path) -> Mapping[str, str]:
    """
    Reads a FASTA file and returns a dictionary of sequence IDs and their corresponding sequences.

    Args:
        path (Path): The path to the FASTA file.

    Returns:
        Mapping[str, str]: A dictionary mapping sequence IDs to their sequences.

    Examples:
        >>> path = Path("sequences.fasta")
        >>> sequences = read_fasta(path)
        >>> print(sequences)
        {"seq1": "ATCGATC", "seq2": "GTCAGTC"}
    """
    with path.open(mode="r") as f:
        sequences = partition(f.readlines())
    return dict(sequences)

Solution 1

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
def gc_content(sequence):
    g, c = sequence.count("G"), sequence.count("C")
    return (g + c) / len(sequence)


def highest_gc_content(sequences):
    sequence_header = ""
    highest_gc = 0

    for header, sequence in sequences.items():
        gc = gc_content(sequence)
        if gc > highest_gc:
            highest_gc = gc
            sequence_header = header
    return sequence_header, highest_gc

Solution 2

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
from collections import Counter


def gc_content(sequence):
    counts = Counter(sequence)
    return (counts["G"] + counts["C"]) / len(sequence) 


def highest_gc_content(sequences):
    sequence_header = ""
    highest_gc = 0

    for header, sequence in sequences.items():
        gc = gc_content(sequence)
        if gc > highest_gc:
            highest_gc = gc
            sequence_header = header
    return sequence_header, highest_gc
0%