"""
    Handle RNA structure data files.
    - Author: Vincent Therrien (therrien.vincent.2@courrier.uqam.ca)
    - Affiliation: Département d'informatique, UQÀM
    - File creation date: June 2023
    - License: MIT
"""
from diurnal.utils import log
def read_ct_file(path: str) -> tuple:
    """
    Read a CT (Connect table) file and return its information.

    Only the first structure of the file is read: parsing stops as soon
    as the number of bases announced by the header has been collected.
    Indices that are absent from the file are filled with the
    placeholder base `N` (unpaired) so that every following base stays
    at the position given by its index column. Blank lines are ignored,
    and a row whose index was already filled is skipped.

    Args:
        path (str): File path of the CT file.

    Returns (tuple):
        The returned tuple contains the following data:
        - RNA molecule title (header fields after the first one, joined
            by single spaces).
        - Primary structure (i.e. a list of 'A', 'C', 'G', and 'U', with
            'N' standing for indices missing from the file).
        - Pairings (i.e. a list of zero-based integers indicating the
            index of the paired base, with `-1` indicating unpaired
            bases).

    Raises:
        IndexError: The header has too few fields (e.g. empty file).
        ValueError: The header length or a row index / pairing is not an
            integer, or a non-blank data row does not have 6 columns.
    """
    bases = []
    pairings = []
    with open(path) as f:
        header = f.readline()
        fields = header.split()
        # Headers starting with `>` store the length in their third
        # field; otherwise the length is the first field.
        if header.startswith(">"):
            length = int(fields[2])
        else:
            length = int(fields[0])
        title = " ".join(fields[1:])
        for line in f:
            if len(bases) >= length:
                # The first structure is complete; any following
                # structure in the same file is ignored.
                break
            columns = line.split()
            if not columns:
                continue
            bn, b, _, _, p, _ = columns
            index = int(bn)
            # Pad every index skipped by the file so that the current
            # row lands at position `index - 1`.
            while len(bases) + 1 < index and len(bases) < length:
                bases.append("N")
                pairings.append(-1)
            # Drop rows that cannot be placed: index beyond the announced
            # length, or index already filled (duplicate / backwards).
            if len(bases) >= length or index != len(bases) + 1:
                continue
            bases.append(b)
            # CT pairings are 1-based with 0 meaning unpaired, hence the
            # shift to zero-based indices with -1 for unpaired bases.
            pairings.append(int(p) - 1)
    return title, bases, pairings
def read_ct_file_length(path: str) -> int:
    """Get the size of the sequence written in a CT file.

    Only the first line (header) of the file is read. When the header
    starts with `>`, the length is taken from its third
    whitespace-separated field; otherwise it is taken from the first.

    Args:
        path (str): File path of the CT file.

    Returns (int): Number of bases in the sequence.

    Raises:
        IndexError: The header has too few fields (e.g. empty file).
        ValueError: The length field is not an integer.
    """
    with open(path) as f:
        header = f.readline()
    fields = header.split()
    if header.startswith(">"):
        return int(fields[2])
    return int(fields[0])