class TaxonomyToGene

	# Read-only access to the state set up by the constructor:
	#   known_genes                      - all gene names of species_with_corresponding_genes (flattened, without duplicates)
	#   taxa_with_tax_obj                - Hash: key = taxon name, value = taxonomy object
	#   species_with_corresponding_genes - Hash: key = species name, value = list of gene names
	#   species_not_found_in_NCBI_tax    - species for which no lineage could be determined
	#   species_without_gene_structures  - list initialised to [] by the constructor
	attr_reader :known_genes, :taxa_with_tax_obj, :species_with_corresponding_genes, 
		:species_not_found_in_NCBI_tax, :species_without_gene_structures


	# Builds the species/gene mapping from the linked list and resolves the taxonomy of every mapped species.
	#
	# path_to_taxdump: path to NCBI taxonomy dump file
	# path_to_linked_list_genenames_speciesnames: path to file specifying which fasta header (= gene name) belongs to which species
	# is_no_grep: handed to extract_taxonomy_of_species_in_alignment_from_taxdump (true: read dump files into RAM instead of calling grep)
	# is_call_grep_with_nice: if true, @be_nice is set so that grep commands get prefixed with "nice"
	# genes_with_data: list of all genes with gene structure (all other genes named in the linked list are ignored)
	def initialize(path_to_taxdump, path_to_linked_list_genenames_speciesnames, is_no_grep, is_call_grep_with_nice, genes_with_data )

		@species_not_found_in_NCBI_tax = []
		@species_without_gene_structures = []

		@grep = `which grep`.chomp
		@be_nice = is_call_grep_with_nice ? true : false

		# the parser is a class method and cannot see instance variables,
		# so the list to be filled is handed over explicitly
		@species_with_corresponding_genes = self.class.map_genenames_to_speciesnames(
			path_to_linked_list_genenames_speciesnames,
			genes_with_data,
			@species_without_gene_structures
		)
		@known_genes = @species_with_corresponding_genes.values.flatten.uniq

		@taxa_with_tax_obj = extract_taxonomy_of_species_in_alignment_from_taxdump( path_to_taxdump, is_no_grep ) # Hash: key= taxon, value: taxonomy object
	end

	# read in file containing mapping between fasta header (=gene names) and species
	# expected line format: "gene1, gene2, ... : species name" (everything after the first ":" is the species)
	#
	# path_to_linked_list: path to the mapping file
	# genes_with_data: genes for which gene structures exist; lines naming none of them are skipped
	# species_without_gene_structures: optional list; receives every species that ended up
	#   without any gene from genes_with_data (each species at most once)
	# returns Hash: key = species name, value = list of gene names (without duplicates)
	# calls Helper.abort if a line lacks genes or a species name
	def self.map_genenames_to_speciesnames(path_to_linked_list, genes_with_data, species_without_gene_structures = [])
		species2genes = {}
		IO.foreach(path_to_linked_list) do |line|

			line = line.chomp
			next if line.strip.empty? # skip blank and whitespace-only lines

			parts = line.split(":")
			genes = parts.shift
			# join never returns nil, so a missing species shows up as an empty string
			species = parts.join(":").strip # remove leading & trailing white spaces
			if genes.nil? || species.empty? then
				Helper.abort "Invalid syntax in file #{path_to_linked_list}. Expecting \':\'-separated list of genes and species"
			end
			genes = genes.split(",").map { |g| g.strip }

			if ! genes.is_overlapping_set?(genes_with_data) then 
				# for this genes, no gene structures exist. do not bother with their taxonomy
				if ! species_without_gene_structures.include?(species) then 
					species_without_gene_structures.push(species)
				end
				next
			end

			species2genes[species] ||= []
			species2genes[species] |= genes
		end

		# a species listed on several lines may have gene structures on one of them only
		species_without_gene_structures.reject! { |species| species2genes.has_key?(species) }

		return species2genes
	end

	# extract lineage for all species present in alignment from taxdump
	# and convert the lineages into taxonomy objects
	#
	# path_to_taxdump: path to the taxonomy archive; names.dmp and nodes.dmp are expected
	#   in (or get extracted into) the same directory
	# is_no_grep: true  - read names.dmp and nodes.dmp into RAM
	#             false - search the files with grep (taxdump is a large file and ram may be restricted)
	# returns Hash: key = taxon name, value = taxonomy object
	# side effects: species without lineage are unlinked from their genes and
	#   collected in @species_not_found_in_NCBI_tax
	def extract_taxonomy_of_species_in_alignment_from_taxdump( path_to_taxdump, is_no_grep )

		path_to_namesdmp, path_to_nodesdmp = locate_or_extract_dmp_files(path_to_taxdump)

		if is_no_grep then 
			lineage_by_species = collect_lineages_in_ram(path_to_namesdmp, path_to_nodesdmp)
		else
			lineage_by_species = collect_lineages_with_grep(path_to_namesdmp, path_to_nodesdmp)
		end

		# lineages are stored from species to root; the taxonomy objects need them from root to species
		root_first_lineages = {}
		get_all_species_linked_to_genes.each do |species|
			root_first_lineages[species] = lineage_by_species[species].reverse
		end

		# convert lineages to taxonomy objects (cant do this in one step, as distance to root is only known when having complete lineage)
		# NOTE: with fewer than two linked species there is no pair, so no taxonomy object is created
		tax_objs_by_name = {}

		root_first_lineages.keys.combination(2).each do |species1, species2|

			lineage1 = root_first_lineages[species1]
			lineage2 = root_first_lineages[species2]

			lca = (lineage1 & lineage2).last # last as lineages are ordered from root to species

			add_or_update_taxonomy_obj(tax_objs_by_name, lineage1, species1, lca)
			add_or_update_taxonomy_obj(tax_objs_by_name, lineage2, species2, lca)

		end

		return tax_objs_by_name
	end

	# returns paths to names.dmp and nodes.dmp located next to the taxdump archive;
	# if one of them is missing, tries to extract both from the archive first
	def locate_or_extract_dmp_files(path_to_taxdump)
		dir_taxdump = File.dirname(path_to_taxdump)
		path_to_namesdmp = File.join( dir_taxdump, "names.dmp" )
		path_to_nodesdmp = File.join( dir_taxdump, "nodes.dmp")

		if ! ( Helper.file_exist?( path_to_namesdmp ) && Helper.file_exist?( path_to_nodesdmp ) ) then 
			# last attempt to get files; maybe they simply need to be extracted
			tar = `which tar`.chomp
			# arguments are passed as a list (no shell), so paths containing spaces or quotes are safe;
			# success is judged by the existence checks below, not by the exit status
			system(
				tar, "-xf", File.absolute_path(path_to_taxdump), "names.dmp", "nodes.dmp",
				:chdir => dir_taxdump, :out => File::NULL, :err => File::NULL
			)
			Helper.file_exist_or_die(path_to_nodesdmp)
			Helper.file_exist_or_die(path_to_namesdmp)
		end

		return path_to_namesdmp, path_to_nodesdmp
	end

	# reads names and nodes into RAM and determines the lineage of every linked species
	# returns Hash: key = species, value = lineage (from species to root)
	def collect_lineages_in_ram(path_to_namesdmp, path_to_nodesdmp)
		nodes_with_parents_and_name = {} # key: taxid, value: [parent taxid, scientific name]
		species_with_taxid = {}

		IO.foreach(path_to_nodesdmp) do |line|
			parts = line.chomp.split("\t|\t")
			nodes_with_parents_and_name[ parts[0].to_i ] = [ parts[1].to_i ]
		end

		# build the lookup once instead of once per line of names.dmp
		is_linked_species = {}
		get_all_species_linked_to_genes.each { |species| is_linked_species[species] = true }

		IO.foreach(path_to_namesdmp) do |line|
			next if ! line.include?("scientific name")

			parts = line.chomp.split("\t|\t")
			taxid = parts[0].to_i
			name = parts[1]

			node = nodes_with_parents_and_name[taxid]
			next if node.nil? # name without entry in nodes.dmp: nothing to attach it to

			node.push(name)
			species_with_taxid[name] = taxid if is_linked_species.has_key?(name)
		end

		lineage_by_species = {}
		get_all_species_linked_to_genes.each do |species|
			found_species, lineage = get_lineage_by_species_no_grep(species, nodes_with_parents_and_name, species_with_taxid)
			record_lineage_or_unlink_species(lineage_by_species, found_species, lineage)
		end
		return lineage_by_species
	end

	# determines the lineage of every linked species with grep;
	# lookups run in threads, at most 10 species at a time; prints one "." per batch
	# returns Hash: key = species, value = lineage (from species to root)
	def collect_lineages_with_grep(path_to_namesdmp, path_to_nodesdmp)
		lineage_by_species = {}

		get_all_species_linked_to_genes.each_slice(10) do |slice|
			print "."

			threads = slice.map do |species|
				Thread.new{ Thread.current[:output] = get_lineage_by_species(species, path_to_nodesdmp, path_to_namesdmp) }
			end
			threads.each do |tr|
				tr.join
				found_species, lineage = tr[:output]
				record_lineage_or_unlink_species(lineage_by_species, found_species, lineage)
			end
		end

		return lineage_by_species
	end

	# parsing taxonomy for species might have been not successful; in that case lineage is nil:
	# the species is removed from the list of linked species and collected for output
	def record_lineage_or_unlink_species(lineage_by_species, species, lineage)
		if lineage then 
			lineage_by_species[species] = lineage
		else
			unlink_species_from_gene(species)
			@species_not_found_in_NCBI_tax.push( species )
		end
	end

	private :locate_or_extract_dmp_files, :collect_lineages_in_ram, :collect_lineages_with_grep, :record_lineage_or_unlink_species

	# Walks a lineage and makes sure every taxon on it has a taxonomy object that knows about species.
	#
	# tax_objs_by_name: Hash (taxon name => taxonomy object), updated in place
	# lineage: taxon names ordered from root to species, so the index equals the distance to root
	# species: the leaf this lineage belongs to
	# lca: taxon at which species gets registered as "last common ancestor of"
	def add_or_update_taxonomy_obj(tax_objs_by_name, lineage, species, lca)

		last_ind = lineage.size - 1

		lineage.each_with_index do |taxon, ind|
			is_first = (ind == 0)
			is_last = (ind == last_ind)

			if tax_objs_by_name[taxon] then 
				# known taxon: one more leaf below it, and possibly one more direct descendant
				tax_objs_by_name[taxon].add_child(species)
				tax_objs_by_name[taxon].add_descendant( lineage[ind+1] ) unless is_last
			else
				# new taxon: neighbours in the lineage are its ancestor and descendant,
				# "" marks a missing neighbour at either end
				tax_objs_by_name[taxon] = Taxonomy.new(
					taxon, # name
					is_first ? "" : lineage[ind-1], # ancestor
					is_last ? "" : lineage[ind+1], # child (node)
					ind, # distance to root
					species # child (leaf)
				)
			end

			tax_objs_by_name[taxon].add_lca(species) if taxon == lca
		end
	end

	# Collects every species that encodes at least one gene of genes_list.
	# returns a list of species names, in the order of @species_with_corresponding_genes
	def get_species_by_genes(genes_list)
		@species_with_corresponding_genes.each_with_object([]) do |(species, genes), matching_species|
			matching_species.push( species ) if genes.is_overlapping_set?(genes_list)
		end
	end

	# last common ancestor of all species that are currently linked to genes
	def get_last_common_ancestor_of_all_known_species
		get_last_common_ancestor_of( get_all_species_linked_to_genes )
	end

	# Finds the last common ancestor of all species in species_list.
	# Candidates are taxa flagged as last common ancestor whose last_common_ancestor_of list
	# contains every species of species_list; among them the one farthest from the root wins
	# (on equal distance the candidate seen first is kept).
	# returns the taxon name, or "" if there is no candidate
	def get_last_common_ancestor_of(species_list)

		deepest = ""
		@taxa_with_tax_obj.each do |taxon, taxon_obj|
			next if ! taxon_obj.is_last_common_ancestor?
			next if ! species_list.is_subset?( @taxa_with_tax_obj[taxon].last_common_ancestor_of )

			# first candidate is taken as is, later ones only if strictly deeper
			if deepest.empty? ||
				@taxa_with_tax_obj[taxon].distance_to_root > @taxa_with_tax_obj[deepest].distance_to_root then 
				deepest = taxon
			end
		end
		return deepest
	end

	# finds first uniq ancestor for every species associated with gene from genes list
	# also returns number of genes per first uniq ancestor
	#
	# genes_list: genes of interest
	# lca: name of the taxon whose direct descendants are the candidate "first uniq ancestors"
	# returns two lists of equal length and matching order:
	#   names of first uniq ancestors, and number of genes of genes_list encoded below each of them
	# both lists are empty if lca is not a known taxon or has no matching descendants
	# (e.g. if lca is a species, then there is no first uniq ancestor)
	def get_first_uniq_ancestors_with_frequencies_by_genes(genes_list, lca)

		lca_obj = @taxa_with_tax_obj[lca]
		return [], [] if lca_obj.nil? # unknown taxon, e.g. "" when no lca was found

		descendants_of_lca = lca_obj.descendants

		# insertion-ordered tally: key = first uniq ancestor, value = number of genes
		n_genes_by_first_uniq = {}

		@species_with_corresponding_genes.each do |species, genes|
			next if ! genes.is_overlapping_set?(genes_list)

			# first uniq ancestor of this species: the descendant of lca that has the species as leaf
			first_uniq = descendants_of_lca.find do |taxon|
				@taxa_with_tax_obj[taxon].children.include?(species)
			end
			next if ! first_uniq

			n_genes_encoded_by_this_species = genes.intersection(genes_list).size
			n_genes_by_first_uniq[first_uniq] = n_genes_by_first_uniq.fetch(first_uniq, 0) + n_genes_encoded_by_this_species
		end 

		return n_genes_by_first_uniq.keys, n_genes_by_first_uniq.values

	end

	# returns the descendants stored in the taxonomy object of taxon
	# (taxon must be a key of @taxa_with_tax_obj)
	def get_direct_descendants_of_taxon(taxon)
		taxon_obj = @taxa_with_tax_obj[taxon]
		return taxon_obj.descendants
	end

	# true if taxon is known and its taxonomy object reports it as species;
	# taxa that are not part of @taxa_with_tax_obj are never species
	def is_taxon_species(taxon)
		return false if ! @taxa_with_tax_obj.has_key?(taxon)

		@taxa_with_tax_obj[taxon].is_species?
	end

	# Renders the taxonomy below the last common ancestor of the given genes as newick tree string.
	#
	# genes_list: genes whose species define the start taxon of the tree
	# alternative_names: optional Hash (taxon name => name to print instead)
	# returns the string built by print_tree
	def export_as_phb(genes_list, alternative_names={})

		# the tree starts at the last common ancestor of all species encoding the genes
		start_taxon = get_last_common_ancestor_of( get_species_by_genes(genes_list) )
		start_obj = @taxa_with_tax_obj[start_taxon]

		print_tree(start_obj, alternative_names)
	end


	# init recursion: wraps the subtree string of root into the outermost brackets
	#
	# outline of the whole tree output:
	#   root:
	#     open bracket
	#     each descendant:
	#       no descendant            -> write "name:len,"
	#       exactly one descendant   -> continue with that descendant (same rules)
	#       more than one descendant -> open bracket, each descendant (same rules),
	#                                   write close bracket and "name:len,"
	#     write close bracket and ";"
	#   finally every ",)" is replaced by ")"
	def print_tree(root, alternative_names)
		newick = "(" + print_subtree(root, alternative_names) + ");"
		return newick.gsub( ",)", ")" )
	end
	# do recursion: newick fragment for taxon and everything below it
	#
	# taxon: taxonomy object
	# alternative_names: Hash (taxon name => name to print); names without entry are
	#   passed through Helper.sanitize_taxon_name
	# every written node gets the fixed branch length 5 and a trailing ","
	def print_subtree(taxon, alternative_names)
		branch_length = 5
		n_descendants = taxon.descendants.size

		if n_descendants == 1 then 
			# its a inner node, but not an important one: it is skipped, only its descendant is written
			return "" + print_subtree(@taxa_with_tax_obj[taxon.descendants.first], alternative_names)
		end

		taxon_name = alternative_names[taxon.name] || Helper.sanitize_taxon_name(taxon.name)

		if n_descendants == 0 then 
			# its a leaf
			return "#{taxon_name}:#{branch_length},"
		end

		# its an important inner node: descendants in brackets, followed by its own name
		fragments = taxon.descendants.map do |child|
			print_subtree(@taxa_with_tax_obj[child], alternative_names)
		end
		return "(" + fragments.join + ")#{taxon_name}:#{branch_length},"
	end


# FOR 'CONSENSUS TREE'
# DON'T FORGET TO BACKUP OLD RECURSION FORMULAS
# # takes a list of selected taxa as input and generates a subset of tax objs needed to generate a tree out of these taxa
# def prepare_tax_obj_for_phb( selected_taxa, root )

# 	subset_taxa_with_obj = {}
# 	select_tax_obj_for_subtree( @taxa_with_tax_obj[root], selected_taxa, subset_taxa_with_obj )

# 	return subset_taxa_with_obj

# end

# def select_tax_obj_for_subtree( taxon, all_selected_taxa, all_selected_obj )

# 	n_obj_added_by_child = 0
# 	taxon.descendants.each do |child|
# 		n_obj_added_by_child += select_tax_obj_for_subtree( @taxa_with_tax_obj[child], all_selected_taxa, all_selected_obj )
# 	end

# 	if all_selected_taxa.include?( taxon.name ) then 

# 		# add own tax obj to selected_objs
# 		if n_obj_added_by_child == 1 then 
# 			# need to add another child, otherwise tree is not valid anymore
# 			children_not_added = (taxon.descendants - all_selected_obj.keys)
# 			if children_not_added.any? then 
# 				# one of children was not added, add this child for tree's sake
# 				first_child = children_not_added.first
# 				all_selected_obj[first_child] = @taxa_with_tax_obj[first_child]
# 			else
# 				# all children were added -> invent a dummy-child
# 				dummy_name = "Unknown node"
# 				dummy_obj = Taxonomy.new(
# 					dummy_name, # name
# 					taxon.name, # ancestor, is this taxon itself
# 					"", # child, empty
# 					taxon.distance_to_root + 1, # distance to root, one more than from taxon
# 					dummy_name # child (leafs), is dummy itself
# 				)
# 				all_selected_obj[dummy_name] = dummy_obj		
# 			end
# 		end
# 		all_selected_obj[taxon.name] = taxon # add taxon itself

# 		return 1 # has added itself
# 	else

# 		# do not add own tax obj to selected_objs
# 		return n_obj_added_by_child # has not added itself, but some children might have
# 	end
# end
# FOR 'CONSENSUS TREE'



	# # finds first uniq ancestors of species list
	# # input: species list
	# # species frequency (same order as species list)
	# # if optional argument lca ist not provided, the last common ancestor of all species will be calculated first
	# # output might be empty, if lca is a species ( a species has no further descendants)
	# def get_first_uniq_ancestors_with_frequency_of(species_list, species_frequencies, *lca)
	# 	first_uniq_list = []
	# 	occurence_first_uniq = []
	# 	if lca.empty? then 
	# 		lca = get_last_common_ancestor_of(species_list)
	# 	else
	# 		lca = lca.first
	# 	end
	# 	@taxa_with_tax_obj[lca].descendants.each do |taxon|
	# 		children = @taxa_with_tax_obj[taxon].children
	# 		if children.is_overlapping_set?(species_list) then 
	# 			first_uniq_list.push( taxon )
	# 			occurence_first_uniq.push( children.intersection(species_list).size )
	# 		end
	# 	end
	# 	return first_uniq_list, occurence_first_uniq
	# end

	# returns genes encoded by children of input
	# result holds one gene list per child (as returned by get_genes_encoded_by_species);
	# an unknown taxon yields an empty list
	def get_genes_encoded_by_taxon(taxon)
		taxon_obj = @taxa_with_tax_obj[taxon]
		return [] if ! taxon_obj

		taxon_obj.children.map { |child| get_genes_encoded_by_species(child) }
	end
	# returns the genes linked to species without duplicates, or an empty list for an unlinked species
	def get_genes_encoded_by_species(species)
		genes = @species_with_corresponding_genes[species]
		genes ? genes.uniq : []
	end

	# names of all species that currently have genes linked to them
	# (a fresh list, so it may be iterated while species get unlinked)
	def get_all_species_linked_to_genes
		linked_species = @species_with_corresponding_genes.keys
		return linked_species
	end

	# removes species together with its genes from the species/gene mapping
	# returns the genes that were linked to it (nil if the species was not linked)
	def unlink_species_from_gene(species)
		removed_genes = @species_with_corresponding_genes.delete(species)
		return removed_genes
	end

	# Determines the lineage of species by looking up one ancestor after the other in the dump files.
	#
	# returns [species, lineage]; lineage starts with species itself, followed by the
	# capitalized names of its ancestors up to the taxon with taxid "1"
	# returns [species, nil] if any error is raised on the way (e.g. a lookup came back empty)
	def get_lineage_by_species(species, path_to_nodesdmp, path_to_namesdmp)
		begin
			ancestors = [species]

			current_taxid = get_taxid_by_name(species, path_to_namesdmp)
			until current_taxid == "1"
				# step one level up: the parent becomes the current taxon
				current_taxid, ancestor_name = get_parent_taxid_with_name(current_taxid, path_to_nodesdmp, path_to_namesdmp)
				ancestors.push( ancestor_name.capitalize )
			end

			[species, ancestors]
		rescue
			[species, nil]
		end
	end

	# Determines the lineage of species from the in-memory copies of nodes.dmp and names.dmp.
	#
	# species: species name
	# nodes_with_parents_and_name: Hash, key = taxid, value = [parent taxid, name of the taxon itself]
	# species_with_taxid: Hash, key = species name, value = taxid
	# returns [species, lineage]; lineage starts with species itself, followed by the
	#   capitalized names of its ancestors up to and including the taxon with taxid 1
	#   (same layout as the result of get_lineage_by_species)
	# returns [species, nil] if the species, one of its ancestors or an ancestor's name is unknown
	def get_lineage_by_species_no_grep(species, nodes_with_parents_and_name, species_with_taxid)
		# get taxonomy of this species
		lineage = [species]
		visited = {}

		taxid = species_with_taxid[species]
		return [species, nil] if taxid.nil?

		while taxid != 1 do
			# a parent chain that never reaches taxid 1 would loop forever
			return [species, nil] if visited[taxid]
			visited[taxid] = true

			node = nodes_with_parents_and_name[taxid]
			return [species, nil] if node.nil?
			parent_taxid = node[0]

			# the name stored with a node is the node's own name,
			# so the parent's name has to be taken from the parent's entry
			parent_node = nodes_with_parents_and_name[parent_taxid]
			parent_name = parent_node && parent_node[1]
			return [species, nil] if parent_name.nil?

			lineage << parent_name.capitalize
			taxid = parent_taxid
		end

		return species, lineage
	end

	# looks up the parent of taxid in nodes.dmp and the parent's name in names.dmp
	# returns [parent taxid, parent name]
	def get_parent_taxid_with_name(taxid, path_to_nodesdmp, path_to_namesdmp)
		id_of_parent = get_parent_taxid(taxid, path_to_nodesdmp)
		name_of_parent = get_name_by_taxid(id_of_parent, path_to_namesdmp)

		[id_of_parent, name_of_parent]
	end

	# structure of names.dmp:
	#   tax_id\t|\tname\t|\t ...
	# structure of nodes.dmp:
	#   tax_id\t|\tparent tax_id\t|\t ...

	# returns the second field of the first line of file (nodes.dmp) that starts with taxid,
	# i.e. the parent tax_id
	def get_parent_taxid(taxid, file)
		fields = grep_in_dmp("^#{taxid}\t", file)
		fields[1]
	end
	# returns the second field (the name) of the lines of file (names.dmp) that start with taxid
	# and contain "scientific name"
	# (a plain grep_in_dmp on the taxid was used here before; presumably it returned
	# names of other kinds as well - verify before switching back)
	def get_name_by_taxid(taxid, file)
		fields = grep_pipe_without_maxcount("^#{taxid}\t", "scientific name", file)
		fields[1]
	end
	# returns the tax_id (first field) of the first line of file (names.dmp) that contains
	# name as a complete, tab-delimited field; nil if there is no such line
	def get_taxid_by_name(name, file)
		# the search term is interpreted as regular expression by grep, so characters such as
		# "(", ")", "." or "+" in a species name must be escaped to be matched literally
		output_parts = grep_in_dmp("\t#{Regexp.escape(name)}\t", file)
		return output_parts[0]
	end
	# Searches file with grep (case-insensitive, perl-style regular expression) and stops at the first match.
	#
	# search_term: regular expression; consider wrapping it in "\t" to avoid false positives
	# file: path to the dump file
	# returns the fields of the matching line (split at "|" with optional surrounding tabs);
	#   returns an empty list if nothing matched
	def grep_in_dmp(search_term, file)
		output = run_grep( ["--max-count=1", "-i", "-P", search_term, file] )
		return output.split(/\t?\|\t?/)
	end

	# Searches file with grep (case-insensitive, perl-style regular expression) for search_term1
	# and keeps only those matching lines that also match search_term2 (case-sensitive).
	#
	# search_term2 is applied in Ruby; it is interpreted as Ruby regular expression
	# returns the fields of all remaining lines as one flat list (split at "|" with optional
	#   surrounding tabs); returns an empty list if nothing matched
	def grep_pipe_without_maxcount(search_term1, search_term2, file)
		second_filter = Regexp.new(search_term2)
		candidates = run_grep( ["-i", "-P", search_term1, file] )
		output = candidates.each_line.select { |line| line =~ second_filter }.join
		return output.split(/\t?\|\t?/)
	end

	# runs grep with the given argument list and returns everything it wrote to stdout
	# the command is started without a shell, so search terms and paths containing
	# quotes or spaces reach grep unchanged
	def run_grep(arguments)
		command = [@grep] + arguments
		command.unshift("nice") if @be_nice
		return IO.popen(command) { |io| io.read }
	rescue SystemCallError
		# grep (or nice) could not be started; as before, this counts as "nothing found"
		return ""
	end
	private :run_grep

end
