Skip to content

Commit

Permalink
update data discrepancy check
Browse files Browse the repository at this point in the history
  • Loading branch information
kevinxin90 committed Oct 24, 2017
1 parent 4a8d4d0 commit 8da076c
Showing 1 changed file with 63 additions and 0 deletions.
63 changes: 63 additions & 0 deletions src/Demo for Data Discrepancy Check.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,69 @@
" if cnt > total_docs:\n",
" break"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Fetching 424525377 variant(s) . . .\n"
]
}
],
"source": [
"############################################################################\n",
"# Please note looping through all docs in MyVariant.info would take a long\n",
"# time. Thus, for demo purpose, we set the limit to the first 20,000,000 docs.\n",
"# You could change the value of total_docs to scan more docs. You could find \n",
"# more hgvs_ids in the output csv file 'af_afr_discrepancy_check.csv'.\n",
"############################################################################\n",
"total_docs = 20000000\n",
"with open('af_afr_discrepancy_check.csv', 'w') as csvfile:\n",
" # count the total number of docs scanned\n",
" cnt = 0\n",
" # json-ld context file for MyVariant.info\n",
" context = load_context('myvariant.info')\n",
" # write the header for csv file\n",
" fieldnames = ['hgvs_id']\n",
" writer = csv.DictWriter(csvfile, fieldnames=fieldnames)\n",
" writer.writeheader()\n",
" # get all docs in MyVariant.info\n",
" mv = get_client('variant')\n",
" data = mv.query(q='__all__', fetch_all=True)\n",
" # loop through each doc, apply jsonld context \n",
" for doc in data:\n",
" cnt += 1\n",
" if cnt % 50000 ==0:\n",
" print('{} docs have been scanned'.format(cnt))\n",
" # only these sources contain allele frequency information, so only apply json-ld when one or more these sources appear\n",
" if ('cadd' or 'dbnsfp') in doc:\n",
" try:\n",
" doc = flatten_doc(doc)\n",
" doc.update(context)\n",
" nquads_doc = nquads_transform(doc)\n",
" # please note 'http://identifiers.org/af.afr/' is a placeholder for the URI representing afriacan population allele frequency\n",
" af = fetch_value_by_uri(nquads_doc, \"http://identifiers.org/af.afr/\")\n",
" if af and type(af) == list:\n",
" af = [float[_af] for _af in af]\n",
" af.sort()\n",
" if af[-1] * 0.5 > af[0]:\n",
" writer.writerow({'hgvs_id': doc['_id']})\n",
" except:\n",
" #print('error id {}'.format(doc['_id']))\n",
" continue\n",
" else:\n",
" continue\n",
" if cnt > total_docs:\n",
" break"
]
}
],
"metadata": {
Expand Down

0 comments on commit 8da076c

Please sign in to comment.