Package rdkit :: Package ML :: Package Scoring :: Module Scoring
[hide private]
[frames | no frames]

Source Code for Module rdkit.ML.Scoring.Scoring

  1  """ 
  2  $Id$ 
  3   
  4  Scoring - Calculate rank statistics 
  5   
  6  Created by Sereina Riniker, October 2012 
  7  after a file from Peter Gedeck, Greg Landrum 
  8   
  9  """ 
 10   
 11  import math, exceptions 
 12   
 13  """ 
 14  \param scores: ordered list with descending similarity containing  
 15                 active/inactive information 
 16  \param col: column index in scores where active/inactive information is stored 
 17  \param fractions: list of fractions at which the value shall be calculated 
 18  \param alpha: exponential weight 
 19  """ 
 20   
def CalcROC(scores, col):
    """ Determines a ROC curve

    \param scores: ranked list (best first); scores[i][col] is truthy for an
                   active entry and falsy for an inactive one
    \param col: column index in scores where active/inactive information is stored

    Returns a two-element list [FPR, TPR].  Entry i of each list describes the
    state after the first i+1 entries of scores have been selected:
      - FPR[i]: fraction of all inactives selected so far (false positive
                rate, FP/(FP+TN))
      - TPR[i]: fraction of all actives selected so far (true positive
                rate, TP/(TP+FN))
    If there are no actives (or no inactives) the corresponding list is left
    as un-normalized zero counts.

    Raises ValueError if scores is empty.
    """
    numMol = len(scores)
    if numMol == 0:
        raise ValueError('score list is empty')

    TPR = []  # running count of actives, normalized below
    FPR = []  # running count of inactives, normalized below
    numActives = 0
    numInactives = 0

    # walk down the ranked list, recording the cumulative counts
    for entry in scores:
        if entry[col]:
            numActives += 1
        else:
            numInactives += 1
        TPR.append(numActives)    # TP when cutting the list here
        FPR.append(numInactives)  # FP when cutting the list here

    # normalize; skipped for a class with no members to avoid dividing by zero
    if numActives > 0:
        TPR = [1.0 * count / numActives for count in TPR]
    if numInactives > 0:
        FPR = [1.0 * count / numInactives for count in FPR]

    return [FPR, TPR]
46
def CalcAUC(scores, col):
    """ Determines the area under the ROC curve

    The curve is obtained from CalcROC(scores, col); its first list is used
    as the x values and its second list as the y values.  The area is
    integrated with the trapezoidal rule over consecutive points.
    """
    xs, ys = CalcROC(scores, col)

    # accumulate twice the trapezoid areas; halved once at the end
    doubled = 0
    for x0, x1, y0, y1 in zip(xs, xs[1:], ys, ys[1:]):
        doubled += (x1 - x0) * (y1 + y0)

    return 0.5 * doubled
62
63 -def _RIEHelper(scores, col, alpha):
64 numMol = len(scores) 65 alpha = float(alpha) 66 if numMol == 0: raise ValueError('score list is empty') 67 if alpha <= 0.0: raise ValueError('alpha must be greater than zero') 68 69 denom = 1.0/numMol * ((1-math.exp(-alpha)) / (math.exp(alpha/numMol) -1)) 70 numActives = 0 71 sum_exp = 0 72 73 # loop over score list 74 for i in range(numMol): 75 active = scores[i][col] 76 if active: 77 numActives += 1 78 sum_exp += math.exp(-(alpha*(i+1)) / numMol) 79 80 if numActives > 0: # check that there are actives 81 RIE = sum_exp / (numActives * denom) 82 else: 83 RIE = 0.0 84 85 return RIE, numActives
86
def CalcRIE(scores, col, alpha):
    """ RIE originally defined here:
    Sheridan, R.P., Singh, S.B., Fluder, E.M. & Kearsley, S.K.
    Protocols for Bridging the Peptide to Nonpeptide Gap in Topological Similarity Searches.
    J. Chem. Inf. Comp. Sci. 41, 1395-1406 (2001).

    Returns only the RIE value computed by _RIEHelper; the active count it
    also reports is discarded.
    """
    result = _RIEHelper(scores, col, alpha)
    return result[0]
95
def CalcBEDROC(scores, col, alpha):
    """ BEDROC originally defined here:
    Truchon, J. & Bayly, C.I.
    Evaluating Virtual Screening Methods: Good and Bad Metric for the "Early Recognition"
    Problem. J. Chem. Inf. Model. 47, 488-508 (2007).

    The RIE value from _RIEHelper is rescaled between RIEmin and RIEmax,
    both computed here from the fraction of actives in the list.
    Returns 0.0 when the list has no actives and 1.0 when RIEmin and
    RIEmax coincide.
    """
    rie, actives = _RIEHelper(scores, col, alpha)

    # without actives there is nothing to rescale
    if actives <= 0:
        return 0.0

    frac = 1.0 * actives / len(scores)
    upper = (1 - math.exp(-alpha * frac)) / (frac * (1 - math.exp(-alpha)))
    lower = (1 - math.exp(alpha * frac)) / (frac * (1 - math.exp(alpha)))

    if upper == lower:
        # both bounds coincide (every entry is active)
        return 1.0
    return (rie - lower) / (upper - lower)
119
def CalcEnrichment(scores, col, fractions):
    """ Determines the enrichment factor for a set of fractions

    \param scores: ranked list (best first); scores[i][col] is truthy for an
                   active entry
    \param col: column index in scores where active/inactive information is stored
    \param fractions: list of fractions (each within [0,1]) at which the
                      enrichment factor shall be calculated

    For each fraction f the top n = ceil(len(scores)*f) entries are
    considered (at least one entry, so f = 0 is evaluated on the first
    entry).  The enrichment factor is the share of actives among those n
    entries divided by the share of actives in the whole list.

    Returns a list with exactly one value per entry of fractions, in the
    same order.  All values are 0.0 when the list contains no actives.

    Raises ValueError if scores or fractions is empty, or if a fraction
    lies outside [0,1].
    """
    numMol = len(scores)
    if numMol == 0:
        raise ValueError('score list is empty')
    if len(fractions) == 0:
        raise ValueError('fraction list is empty')
    for frac in fractions:
        if frac > 1 or frac < 0:
            raise ValueError('fractions must be between [0,1]')

    # cumActives[n] = number of actives among the n best-ranked entries
    cumActives = [0]
    for entry in scores:
        cumActives.append(cumActives[-1] + (1 if entry[col] else 0))
    numActives = cumActives[-1]

    if numActives == 0:  # no actives: enrichment is undefined, report zeros
        return [0.0] * len(fractions)

    enrich = []
    for frac in fractions:
        # Each fraction is evaluated independently of the others, so a
        # fraction of 1.0, repeated fractions and unsorted input all yield
        # a value at the correct cutoff.
        cutoff = min(numMol, max(1, int(math.ceil(numMol * frac))))
        enrich.append(1.0 * cumActives[cutoff] * numMol / cutoff / numActives)
    return enrich
146 # 147 # Copyright (c) 2013, Novartis Institutes for BioMedical Research Inc. 148 # All rights reserved. 149 # 150 # Redistribution and use in source and binary forms, with or without 151 # modification, are permitted provided that the following conditions are 152 # met: 153 # 154 # * Redistributions of source code must retain the above copyright 155 # notice, this list of conditions and the following disclaimer. 156 # * Redistributions in binary form must reproduce the above 157 # copyright notice, this list of conditions and the following 158 # disclaimer in the documentation and/or other materials provided 159 # with the distribution. 160 # * Neither the name of Novartis Institutes for BioMedical Research Inc. 161 # nor the names of its contributors may be used to endorse or promote 162 # products derived from this software without specific prior written permission. 163 # 164 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 165 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 166 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 167 # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 168 # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 169 # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 170 # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 171 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 172 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 173 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 174 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 175 # 176