Skip to content

Descriptive Status

Nathaniel Maki edited this page Feb 21, 2020 · 3 revisions

Input Parsing

  • uses argparse, takes a config.yaml file in as an argument
  • allows for relative re-usability and avoids the pitfall of hardcoding parameters and file paths
  • optparse-inspired command-line parsing library that handles both optional and positional arguments
  • produces highly informative usage messages
  • supports parsers that dispatch to sub-parsers.
parser = argparse.ArgumentParser()
parser.add_argument('filename')
args = parser.parse_args()
with open(args.filename) as file:
    config = yaml.full_load(file)

Access Configuration File

  • yaml or json file
  • contains path to DESeq output csv and
  • primary and secondary thresholds
  • ncbi organism ID
  • directly referenced to create initial pandas dataframe
input_path = config['DESeq_input']['path']
baseMean = config['baseMean']
    log2FoldChange = config['log2FoldChange']
    lfcSE = config['lfcSE']
    pvalue = config['pvalue']
    padj = config['padj']
    species_id = config['species']
    cs_threshold = config['cs_threshold']

Configuration File Details

# DESeq2 input file
# look into making paths to csv files non hard-coded
# original path: /Users/nmaki/Documents/GitHub/IDEA/tests/DESeq2_genes_wPC_DESeq2out.csv
DESeq_input:
  class: File
  path: ./tests/DESeq2_genes_wPC_DESeq2out.csv

# initial thresholds for DESeq2 candidates to send to String-db
# append initial
baseMean: 20.0 # lower bound
log2FoldChange: 1.0 # lower bound on abs val +-
lfcSE: 0.5 # upper bound
pvalue: 0.0001 # upper bound
padj: 0.05 # upper bound

# add secondary threshold for above parameters
# append secondary 
# if adjusted pvalue is not present, how will it be treated?
# round one, thrown out?
# second? Maybe not use?
t2_baseMean: 25.0
t2_log2FoldChange: 2.0
t2_lfcSE: 1.0
t2_pvalue: 0.001
t2_padj: 0.1
# output directory
# may be handled within python program as opposed to config file

# instead of hardcoding, pass taxonomy ID to main program
# create lookup table so that we don't have to reference ncbi for ID 
species: "7955"

# add combined_score threshold (search for higher score)
# default to 0.95
# on return from interaction_partners check if combined score is >= threshold
cs_threshold: 0.95

Initial Dataframe

  • using pandas, create a dataframe from csv file passed in through config file
df = pd.read_csv(input_path)

Thresholded Dataframe

  • secondary dataframe is created, comprised of methods of thresholding from the config file
  • utilizes df.loc to work only with columns defined in config file
df_threshold = df.loc[(df['baseMean'] > baseMean) 
                                    & (df['log2FoldChange'] < log2FoldChange)
                                    & (df['lfcSE'] < lfcSE)
                                    & (df['pvalue'] < pvalue)
                                    & (df['padj'] < padj)]

Thresholded Genes Dataframe

  • tertiary dataframe that is comprised of only the gene ids which passed initial thresholding
  • this is passed to STRING to acquire matching Protein ids.
my_genes = df_threshold['genes']

mapId Function

  • takes gene ids held in the thresholded dataframe as input
  • uses the STRING API to retrieve matching Ensembl protein ids
  • returns both Ensembl gene id sent and protein id, with ncbi taxonomy id as a prefix
def mapId():
        string_api_url = "https://string-db.org/api"
        output_format = "tsv"
        method = "get_string_ids"
        # configure parameters
        params = {

            "identifiers": "\r".join(my_genes),
            "species": species_id,
            "limit": 1,
            "echo_query": 1,
            "caller_identity": "mdibl.org"
        }
        request_url = "/".join([string_api_url, output_format, method])
        results = requests.post(request_url, data=params)
        for line in results.text.strip().split("\n"):
            l = line.split("\t")
            input_identifier, string_identifier = l[0], l[2]
            print("Input:", input_identifier, "STRING:",
                    string_identifier, sep="\t")

networkInteraction Function

  • uses STRING API to display experimentally significant protein interactions
  • for every protein in a given list, prints protein-protein interactions with high confidence experimental scores
  • retrieves only interactions between set of input proteins and closest interaction neighborhood
def networkInteraction():
        string_api_url = "https://string-db.org/api"
        output_format = "tsv-no-header"
        method = "network"
        request_url = "/".join([string_api_url, output_format, method])
        params = {

            "identifiers": "%0d".join(my_genes),  # your protein
            "species": species_id,  # species NCBI identifier
            "caller_identity": "mdibl.org"  # your app name
        }

        # read and parse results
        response = requests.post(request_url, data=params)
  • p0 through p4 represent the output fields from STRING (additional ones can be added)
  • p0 - stringId_A = input id
  • p1 - stringId_B = matched protein id
  • p2 - preferredName_A = common input protein name
  • p3 - preferredName_B = common matched protein name
  • p4 - experimental score
for line in response.text.strip().split("\n"):
            l = line.strip().split("\t")
            p0, p1, p2, p3, p4 = l[0], l[1], l[2], l[3], l[4]
            experimental_score = float(l[10])
            if experimental_score != 0:
                print("\t".join(
                    [p0, p1, p2, p3, p4, "experimentally confirmed (prob. %.3f)" % experimental_score]))

interactionPartners Function

  • provides the interactions between your set of proteins and all the other STRING proteins
  • limits the number of retrieved interaction per protein using "limit" parameter (high scoring will be first)
def interactionPartners():
        string_api_url = "https://string-db.org/api"
        output_format = "tsv-no-header"
        method = "interaction_partners"

        request_url = "/".join([string_api_url, output_format, method])

        params = {

            "identifiers": "%0d".join(my_genes),  # your protein
            "species": species_id,  # species NCBI identifier
            "limit": 1,
            "caller_identity": "mdibl.org"  # your app name
        }

        response = requests.post(request_url, data=params)

        for line in response.text.strip().split("\n"):

            l = line.strip().split("\t")
            query_ensp = l[0]
            query_name = l[2]
            partner_ensp = l[1]
            partner_name = l[3]
            combined_score = l[5]
            print("\t".join([query_ensp, query_name,
                                partner_ensp, partner_name, combined_score]))