"""Compare one rater's include/exclude gating decisions against a panel of experts.

The script reads a whitespace-separated table of 0/1 inclusion decisions
(one row per event, one column per rater), treats one column as the
"candidate" and the remaining columns as the "experts", and reports several
agreement metrics (match ratio, a Mallows-style score and V-measure).

All ``print`` calls use the single-argument parenthesised form so the file
behaves the same under Python 2 (which it was written for) and Python 3.
"""
import math

from numpy import array


class ClusterComparisonMetricComparison():
    """Agreement metrics between a candidate rater and a panel of experts."""

    def __init__(self):
        # Cached table of inclusion decisions (rows = events, columns = raters).
        self.INCLUSION_DATA = None
        # File the cache was read from; None when nothing has been loaded yet
        # or when INCLUSION_DATA was assigned directly by the caller.
        self._loaded_filename = None

    @staticmethod
    def _read_inclusion_table(filename):
        """Parse ``filename`` into a 2-D array of floats.

        The first line is treated as a header and skipped. On every other
        line the first field is dropped and fields 1..8 are converted to
        float. Blank lines are ignored so they cannot produce ragged rows.
        """
        rows = []
        with open(filename, 'r') as handle:
            next(handle, None)  # skip the header line
            for line in handle:
                fields = line.split()
                if not fields:
                    continue
                rows.append([float(item) for item in fields[1:9]])
        return array(rows)

    @staticmethod
    def _expert_average(votes):
        """Return the mean of one event's expert votes as a float."""
        return sum(votes) / float(len(votes))

    @staticmethod
    def _percent(numerator, denominator):
        """Return 100 * numerator / denominator, or NaN when denominator is 0."""
        if denominator == 0:
            return float('nan')
        return (float(numerator) / float(denominator)) * 100.0

    @staticmethod
    def _one_minus_ratio(conditional_entropy, entropy):
        """Return 1 - conditional_entropy / entropy, or 1.0 when entropy is 0.

        A zero entropy means the reference labelling has a single class, which
        the V-measure definition treats as perfectly homogeneous/complete.
        """
        if entropy == 0:
            return 1.0
        return 1 - (conditional_entropy / entropy)

    def get_arrays_from_file(self, candidate_index, filename='mrdata.txt'):
        """Split the inclusion table into a candidate column and expert columns.

        The table is read from ``filename`` on first use and cached in
        ``self.INCLUSION_DATA``; it is re-read only if a different filename
        is requested later.

        Args:
            candidate_index: column index of the rater to evaluate.
            filename: path of the whitespace-separated data file.

        Returns:
            ``(candidate, experts)`` where ``candidate`` is the selected
            column (one value per event) and ``experts`` is a 2-D array with
            one row per event and one column per remaining rater.
        """
        # ``is None`` rather than ``== None``: once the cache holds a numpy
        # array, ``==`` compares element-wise and cannot be used in an ``if``.
        stale = self._loaded_filename not in (None, filename)
        if self.INCLUSION_DATA is None or stale:
            self.INCLUSION_DATA = self._read_inclusion_table(filename)
            self._loaded_filename = filename

        data = self.INCLUSION_DATA
        # This is the rater the comparison measures are calculated against.
        candidate = data[:, candidate_index]
        # Every other column is an expert for the purposes of this comparison.
        experts = [data[:, i] for i in range(len(data[0]))
                   if i != candidate_index]
        return candidate, array(experts).transpose()

    def print_summary_info(self, candidate, experts, require_consensus=False):
        """Print how often the candidate agrees when the experts say "exclude".

        For each event the expert votes are tallied: every vote that is not 0
        counts towards inclusion and every vote that is not 1 counts towards
        exclusion.

        Args:
            candidate: per-event candidate decisions (0 or 1).
            experts: per-event sequences of expert decisions.
            require_consensus: when False (default, the original behaviour)
                the expert verdict is the majority vote; when True an event
                only counts if the experts are unanimous, and events without
                unanimity are skipped.

        Prints the percentage of expert-excluded events that the candidate
        also excluded ("nan" if the experts excluded nothing).
        """
        experts_agree_0 = 0
        experts_agree_1 = 0
        candidate_agree_0 = 0
        candidate_agree_1 = 0
        for decision, votes in zip(candidate, experts):
            is_1 = 0
            is_0 = 0
            for each in votes:
                if each != 0:
                    is_1 += 1
                if each != 1:
                    is_0 += 1

            if require_consensus:
                verdict_0 = is_1 == 0
                verdict_1 = (not verdict_0) and is_0 == 0
            else:
                verdict_0 = is_0 > is_1
                verdict_1 = is_1 > is_0

            if verdict_0:
                experts_agree_0 += 1
                if decision == 0:
                    candidate_agree_0 += 1
            elif verdict_1:
                experts_agree_1 += 1
                if decision == 1:
                    candidate_agree_1 += 1
            elif not require_consensus:
                # A tie in the majority vote; not expected with an odd panel.
                print("this line can't be reached there are seven experts!")

        # Candidate agreement on events the experts excluded.
        print(str(self._percent(candidate_agree_0, experts_agree_0)))
        # Other summary figures that were disabled in the original script:
        # print(str(self._percent(candidate_agree_1, experts_agree_1)))
        # print(str(self._percent(experts_agree_0 + experts_agree_1, len(candidate))))
        # print(str(self._percent(candidate_agree_0 + candidate_agree_1,
        #                         experts_agree_0 + experts_agree_1)))

    def calculate_match_ratio(self, candidate, experts):
        """Return the weighted fraction of events where the candidate matches.

        For each event:
            weight = (distance of the expert average from 0.5) squared
            agree  = candidate is 0 and average < 0.5, or
                     candidate is 1 and average >= 0.5
        match ratio = sum(weight where agree) / sum(weight)

        ``candidate`` and ``experts`` are expected to have one entry per event.

        Raises:
            ZeroDivisionError: if there are no events, an event has no expert
                votes, or every expert average is exactly 0.5 (all weights 0).
        """
        sum_weights_where_agree = 0
        sum_weights = 0
        for decision, votes in zip(candidate, experts):
            average = self._expert_average(votes)
            weight = abs(average - 0.5) ** 2
            sum_weights += weight
            if (average < 0.5 and decision == 0) or \
                    (average >= 0.5 and decision == 1):
                sum_weights_where_agree += weight
        return sum_weights_where_agree / sum_weights

    def calculate_mallows_distance(self, candidate, expert):
        """Return the Mallows-style score of the candidate against the experts.

        For each event, p is the probability of the experts' majority verdict:
        the expert average if it is >= 0.5, otherwise 1 - average. The
        returned value is the sum of p over the events where the candidate
        gives the same verdict as the experts.

        NOTE(review): the original description defined the distance as
        sum(p * cost_of_mistake) with cost 0 on agreement and 1 on
        disagreement, which would sum over the *disagreeing* events. The code
        has always summed over agreeing events; that behaviour is preserved
        here - confirm which one is intended.

        Args:
            candidate: per-event candidate decisions (0 or 1).
            expert: per-event sequences of expert decisions. (The parameter
                name is kept for backward compatibility.)
        """
        # The body used to read a module-level ``experts`` instead of this
        # argument, so it only worked when run as a script.
        experts = expert
        summation = 0
        for decision, votes in zip(candidate, experts):
            average = self._expert_average(votes)
            if average < 0.5 and decision == 0:
                summation += (1 - average)
            elif average >= 0.5 and decision == 1:
                summation += average
        return summation

    def calculate_v_measure(self, candidate, expert):
        """Return the V-measure of the candidate against the expert majority.

        ack = number of events in candidate cluster k and expert cluster c.
        hck = -sum over k, c of (ack / n) * log(ack / sum over c of ack)
        hc  = -sum over c of (sum over k of ack / n) * log(sum over k of ack / n)
        h   = 1 - hck / hc                      (homogeneity)
        c   = the same with the roles of candidate and experts swapped
              (completeness)
        v   = (1 + beta) * h * c / (beta * h + c)

        The expert labelling is the per-event majority (average >= 0.5 -> 1).
        A zero entropy in a denominator yields a score of 1 for that term.

        NOTE(review): the original description says beta = 1 is used, but the
        call below has always passed 0.5; the value is preserved - confirm.

        Args:
            candidate: per-event candidate decisions (0 or 1).
            expert: per-event sequences of expert decisions. (The parameter
                name is kept for backward compatibility.)
        """
        k = candidate
        # Reads the argument; the original read a module-level ``experts``.
        c = [0 if self._expert_average(event) < 0.5 else 1 for event in expert]
        homogeneity = self._one_minus_ratio(
            self.calc_v_measure_sums_2d(c, k),
            self.calc_v_measure_sums_1d(c, k))
        completeness = self._one_minus_ratio(
            self.calc_v_measure_sums_2d(k, c),
            self.calc_v_measure_sums_1d(k, c))
        return self.calc_v(homogeneity, completeness, 0.5)

    def calc_v(self, h, c, weight):
        """Combine homogeneity ``h`` and completeness ``c`` into one score.

        Returns (1 + weight) * h * c / (weight * h + c), or 0.0 when the
        denominator is zero (both scores are zero).
        """
        top = (1 + weight) * h * c
        bottom = (weight * h) + c
        if bottom == 0:
            return 0.0
        return top / bottom

    def calc_v_measure_sums_2d(self, c, k):
        """Return the conditional entropy of ``c`` given ``k``.

        Both arguments are per-event 0/1 labellings. Empty contingency cells
        contribute nothing (0 * log 0 is taken as 0) instead of raising.
        """
        total = float(len(k))
        summation = 0.0
        for cci in [True, False]:
            # Counts for this k-cluster, split by c-cluster.
            acks = [self.get_ack(k, cci, c, eci) for eci in [True, False]]
            row_total = float(sum(acks))
            for ack in acks:
                if ack == 0:
                    continue
                summation += (ack / total) * math.log(ack / row_total)
        return -summation

    def calc_v_measure_sums_1d(self, c, k):
        """Return the entropy of the labelling ``c``.

        ``k`` is only used to count events per ``c``-cluster. Empty clusters
        contribute nothing instead of raising on log(0).
        """
        total = float(len(k))
        summation = 0.0
        for eci in [True, False]:
            ack_sum = 0
            for cci in [True, False]:
                ack_sum += self.get_ack(k, cci, c, eci)
            if ack_sum == 0:
                continue
            avg_ack_sum = ack_sum / total
            summation += avg_ack_sum * math.log(avg_ack_sum)
        return -summation

    def get_ack(self, k, kin, c, cin):
        """Count events that fall in the selected cluster of both labellings.

        Args:
            k, c: per-event 0/1 labellings of equal length.
            kin: True to select the events ``k`` marks 1, False for those it
                marks 0.
            cin: the same selector for ``c``.
        """
        # Flipping 0 <-> 1 lets the "out" cluster be counted with the same test.
        if not kin:
            k = [abs(x - 1) for x in k]
        if not cin:
            c = [abs(x - 1) for x in c]
        return sum(1 for k_label, c_label in zip(k, c)
                   if k_label == 1 and c_label == 1)

    def test_get_ack(self):
        """Print expected versus actual ``get_ack`` counts for a small example."""
        a = [1, 1, 1, 1, 1, 0]
        b = [0, 0, 1, 1, 0, 0]
        print("ack for in in expect 2, get " + str(self.get_ack(a, True, b, True)))
        print("ack for in out expect 3, get " + str(self.get_ack(a, True, b, False)))
        print("ack for out in expect 0, get " + str(self.get_ack(a, False, b, True)))
        print("ack for out out expect 1, get " + str(self.get_ack(a, False, b, False)))


def main():
    """Print the Mallows-style score for each of the eight raters in turn."""
    comparison = ClusterComparisonMetricComparison()
    # comparison.test_get_ack()
    for i in range(8):
        candidate, experts = comparison.get_arrays_from_file(i)
        print(str(comparison.calculate_mallows_distance(candidate, experts)))
        # Alternative per-candidate reports, disabled in the original script:
        # comparison.print_summary_info(candidate, experts)
        # print(str(comparison.calculate_match_ratio(candidate, experts)))
        # print(str(comparison.calculate_v_measure(candidate, experts)))


if __name__ == "__main__":
    main()