Source code for boolforge.bio_models

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
r"""
This module provides functionality for retrieving, parsing, and loading
biological Boolean network models from public online repositories.

The :mod:`boolforge.bio_models` module allows users to programmatically access
and import published Boolean and logical gene regulatory network models from
GitHub repositories such as:

- expert-curated (ckadelka): manually curated models from the
  Design Principles of Gene Regulatory Networks repository.

- pystablemotifs (jcrozum): models accompanying the PyStableMotifs library.

- biodivine (sybila): models from the Sybila Biodivine Boolean Models repository.

Functions are provided to:

- Recursively list and download files from GitHub folders using the REST API.
- Fetch raw text or byte content from remote sources.
- Parse Boolean network models into BooleanNetwork objects.
- Batch-download and convert all models from supported repositories.

This module is intended to facilitate reproducible research by providing
direct access to real-world Boolean GRN models for simulation, comparison,
and benchmarking.

Example
-------
>>> from boolforge import bio_models
>>> result = bio_models.get_bio_models_from_repository('expert-curated (ckadelka)')
>>> len(result['BooleanNetworks'])
122
>>> result['BooleanNetworks'][0].variables[:5]
['GeneA', 'GeneB', 'GeneC', 'GeneD', 'GeneE']
"""

import requests
import pickle
import io

try:
    from boolforge.boolean_network import BooleanNetwork
except ModuleNotFoundError:
    from boolean_network import BooleanNetwork

def _get_content_in_remote_folder(
    url: str,
    file_names: list,
    file_download_urls: list
) -> None:
    """
    Recursively collect file names and raw download URLs from a GitHub folder.

    The listing at ``url`` is fetched and each entry is handled as follows:

    - an entry with a positive ``size`` and a non-null ``download_url`` is
      recorded as a file;
    - any other entry is descended into only if it is a directory
      (``type == 'dir'``, or no ``type`` field at all);
    - everything else (e.g. empty files, submodules) is skipped, because
      requesting its ``url`` does not yield a folder listing.

    Parameters
    ----------
    url : str
        GitHub API URL pointing to a repository folder.
    file_names : list
        List that will be populated with discovered file names.
    file_download_urls : list
        List that will be populated with corresponding raw download URLs.
        Entries are appended in lockstep with ``file_names``.

    Returns
    -------
    None
        Results are delivered by mutating the two list arguments.

    Raises
    ------
    requests.RequestException
        If the request for ``url`` itself fails, times out, or returns an
        error status. Failures inside *subfolders* are logged as warnings
        and do not abort the traversal.
    """
    import logging

    # A finite timeout keeps a stalled connection from blocking forever.
    folder = requests.get(url, timeout=30)
    folder.raise_for_status()
    folder_json = folder.json()

    for item in folder_json:
        if item['size'] > 0 and item['download_url'] is not None:
            file_names.append(item['name'])
            file_download_urls.append(item['download_url'])
        elif item.get('type', 'dir') == 'dir':
            # Best-effort: one unreadable subfolder must not lose the files
            # already collected from its siblings.
            try:
                _get_content_in_remote_folder(
                    item['url'], file_names, file_download_urls
                )
            except Exception as e:
                logging.warning(
                    "Failed to access subfolder at %s: %s",
                    item.get('url', '<unknown>'),
                    e,
                )
        # Remaining entries are not directories and carry no usable content.
        # They are skipped instead of being requested as if they were folders.


[docs]
def get_content_in_remote_folder(url: str) -> tuple:
    """
    List the files found under a GitHub repository folder.

    Public entry point around :func:`_get_content_in_remote_folder`: two
    empty accumulator lists are created, filled by the recursive helper, and
    returned together.

    Parameters
    ----------
    url : str
        GitHub API URL pointing to a repository folder.

    Returns
    -------
    tuple[list[str], list[str]]
        The pair ``(file_names, file_download_urls)``:

        - ``file_names`` holds the name of each discovered file.
        - ``file_download_urls`` holds the raw download URL of the file at
          the same position in ``file_names``.
    """
    collected_names: list = []
    collected_urls: list = []
    # The helper reports its findings by appending to both lists in place.
    _get_content_in_remote_folder(url, collected_names, collected_urls)
    return (collected_names, collected_urls)




[docs]
def fetch_file(download_url: str, timeout: float | None = 30.0) -> str:
    """
    Download raw text content from a remote file.

    Parameters
    ----------
    download_url : str
        Direct download URL to the file.
    timeout : float or None, optional
        Seconds to wait for the server before giving up (default: 30).
        Pass ``None`` to wait indefinitely.

    Returns
    -------
    str
        File content as plain text, decoded by ``requests``.

    Raises
    ------
    requests.RequestException
        If the request fails, times out, or the server answers with an
        error status code.
    """
    # A finite default timeout keeps batch downloads from hanging forever on
    # a single stalled connection.
    response = requests.get(download_url, timeout=timeout)
    response.raise_for_status()
    return response.text




[docs]
def fetch_file_bytes(download_url: str, timeout: float | None = 30.0) -> bytes:
    """
    Download raw binary content from a remote file.

    Parameters
    ----------
    download_url : str
        Direct download URL to the file.
    timeout : float or None, optional
        Seconds to wait for the server before giving up (default: 30).
        Pass ``None`` to wait indefinitely.

    Returns
    -------
    bytes
        File content as raw, undecoded bytes.

    Raises
    ------
    requests.RequestException
        If the request fails, times out, or the server answers with an
        error status code.
    """
    # A finite default timeout keeps batch downloads from hanging forever on
    # a single stalled connection.
    response = requests.get(download_url, timeout=timeout)
    response.raise_for_status()
    return response.content




[docs]
def load_model(
    download_url: str,
    max_degree: int = 24,
    possible_separators: list = ['* =', '*=', '=', ','],
    original_not: str | list[str] = 'NOT',
    original_and: str | list[str] = 'AND',
    original_or: str | list[str] = 'OR',
    IGNORE_FIRST_LINE: bool = False
) -> BooleanNetwork:
    """
    Load and parse a Boolean network model from a remote text file.

    The file is downloaded with :func:`fetch_file`, optionally stripped of
    its first line, and handed to ``BooleanNetwork.from_string``.

    Parameters
    ----------
    download_url : str
        Direct download URL to the model file.
    max_degree : int, optional
        Maximum allowed in-degree for nodes (default: 24).
    possible_separators : list[str], optional
        Possible assignment separators used in the model file. A copy is
        passed to the parser, so the caller's list (and the shared default)
        is never modified.
    original_not : str or list[str], optional
        Logical negation operator used in the model file. Callers in this
        module also pass a list of alternative spellings; this presumably
        relies on ``BooleanNetwork.from_string`` accepting lists.
    original_and : str or list[str], optional
        Logical AND operator used in the model file (see ``original_not``).
    original_or : str or list[str], optional
        Logical OR operator used in the model file (see ``original_not``).
    IGNORE_FIRST_LINE : bool, optional
        If True, skip the first line of the file (default: False). A file
        without any newline is then treated as having no content left.

    Returns
    -------
    BooleanNetwork
        Parsed Boolean network.

    Raises
    ------
    ValueError
        If the model cannot be parsed. The original parser error is chained
        as ``__cause__``.
    requests.RequestException
        If the download itself fails; download errors are not wrapped.
    """
    string = fetch_file(download_url)
    if IGNORE_FIRST_LINE:
        # partition() drops everything up to and including the first newline
        # and yields '' for a single-line file, where str.index() would raise
        # an uninformative "substring not found" ValueError.
        string = string.partition('\n')[2]

    try:
        bn = BooleanNetwork.from_string(
            string,
            # Copy so the parser cannot mutate the shared default list.
            list(possible_separators),
            max_degree,
            original_not,
            original_and,
            original_or,
        )
    except Exception as e:
        raise ValueError(
            f"Failed to parse Boolean network model from {download_url}"
        ) from e

    return bn



[docs]
def get_bio_models_from_repository(
    repository: str = 'expert-curated (ckadelka)',
    download_urls_pystablemotifs: list[str] | None = None,
) -> dict:
    """
    Load Boolean network models from selected online repositories.

    This function downloads, parses, and constructs Boolean network models
    from several curated online repositories. Models that cannot be parsed
    are skipped and recorded separately.

    Parameters
    ----------
    repository : str, optional
        Identifier of the source repository. Supported values are:

        - 'expert-curated (ckadelka)' (default)
        - 'pystablemotifs (jcrozum)'
        - 'biodivine (sybila)'

    download_urls_pystablemotifs : list[str] or None, optional
        Optional list of direct download URLs for PyStableMotifs models.
        If provided, these URLs are used instead of querying the GitHub API (faster).
        If None (default), model URLs are fetched dynamically from GitHub.

    Returns
    -------
    dict
        Dictionary with the following keys:

        - 'BooleanNetworks' : list[BooleanNetwork]
            List of successfully parsed Boolean network models.

        - 'SuccessfulDownloadURLs' : list[str]
            URLs corresponding to models that were successfully loaded.

        - 'FailedDownloadURLs' : list[str]
            URLs corresponding to models that could not be parsed or loaded.
    """
    repositories = [
        'expert-curated (ckadelka)',
        'pystablemotifs (jcrozum)',
        'biodivine (sybila)',
    ]

    bns = []
    successful_download_urls = []
    failed_download_urls = []

    if repository == 'expert-curated (ckadelka)':
        download_url_base = (
            'https://raw.githubusercontent.com/ckadelka/'
            'DesignPrinciplesGeneNetworks/main/'
            'update_rules_122_models_Kadelka_SciAdv/'
        )
        download_url = download_url_base + 'all_txt_files.csv'
        csv = fetch_file(download_url)

        for line in csv.splitlines():
            download_url = download_url_base + line
            if '.txt' in download_url:
                try:
                    if 'tabular' in download_url:
                        F, I, var, constants = pickle.load(
                            io.BytesIO(fetch_file_bytes(download_url))
                        )
                        for i in range(len(constants)):
                            F.append([0, 1])
                            I.append([len(var) + i])
                        bn = BooleanNetwork(F, I, var + constants)
                    else:
                        bn = load_model(
                            download_url,
                            original_and=" AND ",
                            original_or=" OR ",
                            original_not=" NOT ",
                        )

                    successful_download_urls.append(download_url)
                    bns.append(bn)

                except Exception:
                    failed_download_urls.append(download_url)

    elif repository == 'pystablemotifs (jcrozum)':
        if download_urls_pystablemotifs is None:
            url = "https://api.github.com/repos/jcrozum/pystablemotifs/contents/models"
            _, download_urls = get_content_in_remote_folder(url)
        else:
            download_urls = download_urls_pystablemotifs

        for download_url in download_urls:
            if '.txt' in download_url:
                try:
                    bn = load_model(
                        download_url,
                        possible_separators=['*    =', '*   =', '*  =', '* =', '*='],
                        original_and=[" and ", "&"],
                        original_or=[" or ", "|"],
                        original_not=[" not ", " !"],
                    )
                    successful_download_urls.append(download_url)
                    bns.append(bn)
                except Exception:
                    failed_download_urls.append(download_url)

    elif repository == 'biodivine (sybila)':
        download_url_base = (
            'https://raw.githubusercontent.com/sybila/'
            'biodivine-boolean-models/main/models/'
        )
        download_url = download_url_base + 'summary.csv'
        csv = fetch_file(download_url)

        for line in csv.splitlines():
            try:
                ID, name, variables, inputs, regulations = line.split(', ')
                download_url = (
                    download_url_base
                    + '[id-%s]__[var-%s]__[in-%s]__[%s]/model.bnet'
                    % (ID, variables, inputs, name)
                )
                bn = load_model(
                    download_url,
                    original_and=" & ",
                    original_or=" | ",
                    original_not="!",
                    IGNORE_FIRST_LINE=True,
                )
                successful_download_urls.append(download_url)
                bns.append(bn)
            except Exception:
                failed_download_urls.append(download_url)

    else:
        raise ValueError(
            "repository must be one of:\n - " + "\n - ".join(repositories)
        )

    return {
        "BooleanNetworks": bns,
        "SuccessfulDownloadURLs": successful_download_urls,
        "FailedDownloadURLs": failed_download_urls,
    }