#!/usr/bin/python
import sys
import math

def name_conversion(protein):
	"""Return ``protein`` with its upper-case letters turned into lower case.

	Only the upper-case ASCII letters A-Z and the digits 0-9 are accepted
	(digits are passed through unchanged). Any other character, including
	a letter that is already lower case, raises ``KeyError``.
	"""
	accepted = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
	#one lookup entry per accepted character: letter -> lower case, digit -> itself
	lowered = dict(zip(accepted, accepted.lower()))
	return "".join([lowered[symbol] for symbol in protein])

#Cluster selection script.
#
#Walks the cluster table in ./support/new_clusters.txt row by row, keeps the
#rows whose protein name is listed in ./support/names_bg.txt, and looks up the
#sequence of the row's chain in ./hb_res/NHB_<name>.txt. Whenever the cluster
#id changes, a cluster with at least MIN_CLUSTER_SIZE kept rows is appended to
#./hb_seq/<cluster>_a.fa and summarised in ./hb_seq/report.txt.
#
#Row layout used below: columns 0-4 cluster id, 16-19 protein name, 21 chain.
#
#NOTE: print is called with a single parenthesised argument so that the same
#output is produced by Python 2 and Python 3.

MIN_CLUSTER_SIZE = 10		#smallest cluster that is written out


def _find_chain_sequence(protein_data, chain):
	"""Return the sequence stored for ``chain`` in the lines of an NHB file.

	The lines are scanned as 4-line records starting at index 1: the first
	character of a record's first line is compared with ``chain`` and the
	record's fourth line is returned on a match. The scan stops at the end
	of the data or at a line starting with "LIST OF HY" (the terminator the
	previously disabled record loop of this script relied on), so lines
	after the records are never mistaken for a chain record.

	Returns None when no record matches.
	"""
	k = 1
	while k + 3 < len(protein_data) and protein_data[k][0:10] != "LIST OF HY":
		#[0:1] instead of [0] so that an empty line cannot raise
		if protein_data[k][0:1] == chain:
			return protein_data[k + 3]
		k = k + 4
	return None


print("starting clusters selection")

with open("./support/new_clusters.txt") as f:
	content = f.read().splitlines()

#sentinel row: ends the main loop and forces the last cluster to be flushed
content.append("-1     ")

with open("./support/names_bg.txt") as g:
	names = g.read().splitlines()
known_names = set(names)	#constant-time membership test in the main loop

i = 0				#current line
current = content[i]

#Start from the id of the first row rather than a hard-coded 0: when the table
#does not begin with cluster 0, the first cluster-change check would otherwise
#throw away the first kept row. A first row without a numeric id falls back
#to 0, which was the previous starting value.
try:
	j = int(current[0:5])	#current cluster
except ValueError:
	j = 0

#data to align (three parallel lists, one entry per kept row)
cluster = []
cluster_chains = []
cluster_sequences = []
current_sequence = ""

#report collection; the with-block closes the file even if a row fails
#NOTE(review): report and .fa files are opened in append mode, so re-running
#the script adds to existing output -- confirm this is intended
with open("./hb_seq/report.txt", "a") as report:

	while current[0:3] != "-1 ":

		#check if we have it in our HB data
		protein = current[16:20]
		if protein in known_names:
			chain = current[21]

			#only the chain named in the cluster row is taken
			nhb_path = "./hb_res/NHB_" + name_conversion(protein) + ".txt"
			with open(nhb_path) as h:
				protein_data = h.read().splitlines()

			current_sequence = _find_chain_sequence(protein_data, chain)
			if current_sequence is None:
				raise IndexError("chain " + chain + " not found in " + nhb_path)

			cluster.append(protein)
			cluster_chains.append(chain)
			cluster_sequences.append(current_sequence)

		i = i + 1
		current = content[i]
		next_cluster = int(current[0:5])

		if next_cluster != j:
			#the next row belongs to another cluster: flush the finished one

			if len(cluster) >= MIN_CLUSTER_SIZE:

				print("cluster " + str(j) + " has been accepted ... " + str(len(cluster)) + "seqs has been found!")
				#cluster id right-aligned in 7 columns, then the number of sequences
				report.write("%7d %d\n" % (j, len(cluster)))

				#let's save sequences to file
				with open("./hb_seq/" + str(j) + "_a.fa", "a") as sequence_file:
					for member, member_chain, member_sequence in zip(cluster, cluster_chains, cluster_sequences):
						sequence_file.write(">" + member + ":" + member_chain + "\n")
						sequence_file.write(str(member_sequence) + "\n")

			else:
				print("cluster " + str(j) + " has been rejected ...")

			cluster = []
			cluster_sequences = []
			cluster_chains = []
			j = next_cluster

print("clusters chosen, sequences saved")

