from numpy import array


class ClusterComparisonMetricComparison():
	"""Score one rater's binary include/exclude decisions against the other raters.

	The data is a 2-D array of 0/1 inclusion flags with one row per line of the
	input file and one column per rater. One column is singled out as the
	"candidate"; the remaining columns are the "experts". Several agreement
	measures (match ratio, a Mallows-style sum, V-measure) compare the candidate
	with the experts' majority decision.
	"""

	def __init__(self):
		# Cached array of inclusion flags; filled lazily by get_arrays_from_file.
		self.INCLUSION_DATA = None

	def get_arrays_from_file(self, candidate_index, filename='mrdata.txt'):
		"""Load the inclusion table (once) and split it into candidate and experts.

		The first line of the file is skipped as a header. On every other
		non-blank line the whitespace-separated fields 1..8 are parsed as floats
		(field 0 is ignored).

		The parsed table is cached on the instance, so `filename` is only read on
		the first call; later calls reuse the cached data whatever `filename` is.

		candidate_index -- column to use as the candidate.
		filename -- path of the data file (defaults to 'mrdata.txt').

		Returns (candidate, experts): candidate is the selected column, experts
		is a 2-D array with one row per data line and one column per other rater.
		"""
		# `is None`, not `== None`: once the cache holds a numpy array, `==`
		# compares elementwise and the result cannot be used as a condition.
		if self.INCLUSION_DATA is None:
			with open(filename, 'r') as data_file:
				lines = data_file.readlines()
			inclusion_data = []
			for line in lines[1:]:
				fields = line.split()
				if not fields:
					# A blank (e.g. trailing) line would produce a ragged table.
					continue
				inclusion_data.append([float(item) for item in fields[1:9]])
			self.INCLUSION_DATA = array(inclusion_data)
		# The column the comparison measures are calculated against.
		candidate = self.INCLUSION_DATA[:, candidate_index]
		# Every other column acts as an expert for this comparison.
		experts = [
			self.INCLUSION_DATA[:, column]
			for column in range(len(self.INCLUSION_DATA[0]))
			if column != candidate_index
		]
		return candidate, array(experts).transpose()

	def print_summary_info(self, candidate, experts):
		"""Print the percentage of majority-excluded events the candidate also excludes.

		For each event the expert votes are tallied (a vote != 0 counts towards
		inclusion, a vote != 1 towards exclusion) and the majority decides. To
		use strict consensus instead of the majority, test `is_1 == 0` /
		`is_0 == 0` in place of the two comparisons below.

		Raises ZeroDivisionError if the experts never favour exclusion.
		"""
		experts_agree_0 = 0
		experts_agree_1 = 0
		candidate_agree_0 = 0
		candidate_agree_1 = 0
		for vote, expert_votes in zip(candidate, experts):
			is_1 = sum(1 for each in expert_votes if each != 0)
			is_0 = sum(1 for each in expert_votes if each != 1)
			if is_0 > is_1:
				experts_agree_0 += 1
				if vote == 0:
					candidate_agree_0 += 1
			elif is_1 > is_0:
				experts_agree_1 += 1
				if vote == 1:
					candidate_agree_1 += 1
			else:
				# Only reachable with an even number of (binary) expert votes.
				print("this line can't be reached there are seven experts!")

		print(str((float(candidate_agree_0) / float(experts_agree_0)) * 100.0))
		# Alternative summaries; swap one in for the line above as needed:
#		print(str((float(candidate_agree_1) / float(experts_agree_1)) * 100.0))
#		print(str((float(experts_agree_0 + experts_agree_1) / float(len(candidate))) * 100.0))
#		print(str((float(candidate_agree_0 + candidate_agree_1) / float(experts_agree_0 + experts_agree_1)) * 100.0))

	def calculate_match_ratio(self, candidate, experts):
		"""Return the weighted fraction of events where candidate matches the experts.

		weight = (distance of the experts' average inclusion from 0.5) squared
		match ratio = sum(weight where candidate equals the rounded average) / sum(weight)

		An average of exactly 0.5 counts as inclusion (and carries zero weight).
		Raises ZeroDivisionError when every weight is zero or there are no events.
		"""
		sum_weights_where_agree = 0
		sum_weights = 0
		for i, votes in enumerate(experts):
			average = sum(votes) / float(len(votes))
			weight = abs(average - 0.5)**2
			sum_weights += weight
			if (average < 0.5 and candidate[i] == 0) or (average >= 0.5 and candidate[i] == 1):
				sum_weights_where_agree += weight
		return sum_weights_where_agree / sum_weights

	def calculate_mallows_distance(self, candidate, expert):
		"""Mallow's Distance:
		p(i) = probability of being either excluded or included, i.e.
			if experts average >= 0.5 -> experts average
			else -> 1 - experts average.
		cost_of_mistake = 0 if same yes or no as experts, 1 if not.
		mallows distance = sum(prob * cost_of_mistake)

		candidate -- sequence of 0/1 decisions, one per event.
		expert -- per-event sequences of the experts' 0/1 decisions.

		NOTE(review): the code adds p(i) for events where the candidate AGREES
		with the experts' majority, i.e. the opposite of the cost defined above.
		That behaviour is kept unchanged here; confirm which one is intended.
		"""
		summation = 0
		for i in range(len(candidate)):
			# Use the `expert` argument; the body used to read an unrelated
			# global called `experts`.
			average = sum(expert[i]) / float(len(expert[i]))
			if average < 0.5 and candidate[i] == 0:
				summation += (1 - average)
			elif average >= 0.5 and candidate[i] == 1:
				summation += average
		return summation

	def calculate_v_measure(self, candidate, expert):
		"""V-Measure:
		ack = number of data points candidate cluster(k) includes agreed on by experts cluster(c)
		hck = - sum over all candidate clusters( sum over all expert clusters(ack / num_events) * log(ack / sum(ack) over all expert clusters)))
		hc =  - sum over experts clusters( sum over candidates clusters(ack) / num_events * log ( sum over candidate clusters(ack) / num_events))
		h = 1 - (hck/hc)

		c is symmetric to this, experts where candidate would be and vice versa.
		c = 1 - (hkc/hk)
		v = ((1 + beta) * h * c) / ((beta * h) + c)

		candidate -- sequence of 0/1 decisions, one per event.
		expert -- per-event sequences of the experts' 0/1 decisions; each event
			is reduced to the majority decision (an average of 0.5 counts as 1).

		When an entropy in a denominator is zero the corresponding score is
		taken to be 1.0, the usual V-measure convention.

		NOTE(review): the weight passed to calc_v is 0.5, although this method
		was originally documented as using a beta of 1 (v = 2hc / (h + c)).
		The 0.5 is kept unchanged; confirm which one is intended.
		"""
		k = candidate
		consensus = []
		for event in expert:
			average = sum(event) / float(len(event))
			consensus.append(0 if average < 0.5 else 1)
		homogeneity = self._entropy_score(consensus, k)
		completeness = self._entropy_score(k, consensus)
		return self.calc_v(homogeneity, completeness, 0.5)

	def _entropy_score(self, c, k):
		"""Return 1 - H(c|k)/H(c), or 1.0 when H(c) is zero."""
		marginal = self.calc_v_measure_sums_1d(c, k)
		if marginal == 0:
			return 1.0
		return 1 - (self.calc_v_measure_sums_2d(c, k) / marginal)

	def calc_v(self, h, c, weight):
		"""Combine homogeneity h and completeness c into a weighted harmonic mean.

		Returns ((1 + weight) * h * c) / ((weight * h) + c), or 0.0 when the
		denominator is zero (both scores are zero).
		"""
		top = (1 + weight) * h * c
		bottom = (weight * h) + c
		if bottom == 0:
			return 0.0
		return top / bottom

	def calc_v_measure_sums_2d(self, c, k):
		"""Return the conditional entropy of labelling c given labelling k.

		Computes -sum over k-classes and c-classes of
		(ack / n) * log(ack / sum of ack over the c-classes), in natural-log units.
		Empty cells contribute nothing (the limit of x * log(x) at 0), which
		also avoids log(0) and 0/0 on sparse contingency tables.
		"""
		import math
		summation = 0
		for cci in [True, False]:
			# Counts for this k-class against each c-class; their total is the
			# denominator of the log term, so compute it once per k-class.
			cells = [self.get_ack(k, cci, c, eci) for eci in [True, False]]
			last_term_denom = sum(cells)
			for ack in cells:
				if ack == 0:
					continue
				first_term = ack / float(len(k))
				last_term = float(ack) / float(last_term_denom)
				summation += first_term * math.log(last_term)
		return -summation

	def calc_v_measure_sums_1d(self, c, k):
		"""Return the entropy of labelling c (natural-log units).

		Computes -sum over c-classes of p * log(p), where p is the fraction of
		events in that class. Empty classes contribute nothing.
		"""
		import math
		summation = 0
		for eci in [True, False]:
			ack_sum = 0
			for cci in [True, False]:
				ack_sum += self.get_ack(k, cci, c, eci)
			if ack_sum == 0:
				continue
			avg_ack_sum = float(ack_sum) / float(len(k))
			summation += avg_ack_sum * math.log(avg_ack_sum)
		return -summation

	def get_ack(self, k, kin, c, cin):
		"""Count the events that fall in the requested cell of the 2x2 table.

		k, c -- equal-length sequences of 0/1 labels.
		kin, cin -- True selects label 1 of k / c, False selects label 0.
		"""
		# Flipping a labelling (x -> |x - 1|) lets a single "both equal 1" test
		# cover all four in/out combinations.
		if not kin:
			k = [abs(x - 1) for x in k]
		if not cin:
			c = [abs(x - 1) for x in c]
		return sum(1 for i in range(len(k)) if k[i] == 1 and c[i] == 1)

	def test_get_ack(self):
		"""Print get_ack results next to the expected values for a small example."""
		a = [1,1,1,1,1,0]
		b = [0,0,1,1,0,0]
		print("ack for in in expect 2, get " + str(self.get_ack(a, True, b, True)))
		print("ack for in out expect 3, get " + str(self.get_ack(a, True, b, False)))
		print("ack for out in expect 0, get " + str(self.get_ack(a, False, b, True)))
		print("ack for out out expect 1, get " + str(self.get_ack(a, False, b, False)))
		
if __name__ == "__main__":
	# Take each of the eight data columns in turn as the candidate and score it
	# against the remaining columns. One metric is printed per run; the
	# commented-out lines are the alternative outputs.
	metrics = ClusterComparisonMetricComparison()
#	metrics.test_get_ack()

	for column in range(8):
		candidate, experts = metrics.get_arrays_from_file(column)
#		metrics.print_summary_info(candidate, experts)
#		print(str(metrics.calculate_match_ratio(candidate, experts)))
		mallows = metrics.calculate_mallows_distance(candidate, experts)
		print(str(mallows))
#		print(str(metrics.calculate_v_measure(candidate, experts)))
	
