# Source code for diurnal.utils.rna_data

"""
    Handle RNA structure data files.

    - Author: Vincent Therrien (therrien.vincent.2@courrier.uqam.ca)
    - Affiliation: Département d'informatique, UQÀM
    - File creation date: June 2023
    - License: MIT
"""

from diurnal.utils import log


def read_ct_file(path: str) -> tuple:
    """
    Read a CT (Connect table) file and return its information.

    Only the first structure described in the file is read: parsing stops
    as soon as the number of bases announced by the header has been
    collected. Base numbers that are missing from the file are filled with
    the placeholder base `'N'` and the pairing `-1` so that list positions
    stay aligned with the zero-based pairing indices. Blank lines and
    records whose base number was already consumed are ignored. If the
    file ends before the announced length is reached, the bases read so
    far are returned. A file whose first line is empty or blank yields an
    empty title and two empty lists.

    Args:
        path (str): File path of the CT file.

    Returns (tuple):
        The returned tuple contains the following data:
        - RNA molecule title.
        - Primary structure (i.e. a list of 'A', 'C', 'G', and 'U').
        - Pairings (i.e. a list of integers indicating the index of the
          paired base, with `-1` indicating unpaired bases).

    Raises:
        ValueError: If the length in the header or a field of a base
            record is not an integer, or if a non-blank record does not
            contain exactly six whitespace-separated fields.
        IndexError: If the header has too few fields to hold a length.
    """
    bases = []
    pairings = []
    with open(path) as f:
        header = f.readline()
        header_fields = header.split()
        if not header_fields:
            # Empty file or blank header: there is nothing to parse.
            return "", bases, pairings

        # Headers that start with `>` store the length in their third
        # field; otherwise the length is the first field.
        if header.startswith(">"):
            length = int(header_fields[2])
        else:
            length = int(header_fields[0])
        title = " ".join(header_fields[1:])

        for line in f:
            if len(bases) >= length:
                # The first structure is complete; ignore what follows.
                break
            fields = line.split()
            if not fields:
                continue
            number, base, _, _, partner, _ = fields
            position = int(number) - 1  # CT base numbers are one-based.
            if position < len(bases):
                # Duplicate or out-of-order record: its slot is already
                # filled, so it cannot be placed without misaligning.
                continue
            # Fill any numbering gap (never beyond the declared length) so
            # that list indices keep matching the pairing indices.
            missing = min(position, length) - len(bases)
            bases.extend(["N"] * missing)
            pairings.extend([-1] * missing)
            if position < length:
                bases.append(base)
                # A partner of 0 means "unpaired" and becomes -1.
                pairings.append(int(partner) - 1)

    return title, bases, pairings
def read_ct_file_length(path: str) -> int:
    """Get the size of the sequence written in a CT file.

    Only the first line (the header) of the file is read. Headers that
    start with `>` store the length in their third whitespace-separated
    field; any other header stores it in its first field.

    Args:
        path (str): File path of the CT file.

    Returns (int): Number of bases in the sequence, or `0` if the first
        line of the file is empty or blank.

    Raises:
        ValueError: If the length field of the header is not an integer.
        IndexError: If a header starting with `>` has fewer than three
            fields.
    """
    with open(path) as f:
        header = f.readline()
    fields = header.split()
    if not fields:
        # Empty file or blank header: no sequence is described.
        return 0
    if header.startswith(">"):
        return int(fields[2])
    return int(fields[0])