//============================================================================
// Name        : Csfasta Qual to Illumina Phred 33 Fastq (cq2ip33fq)
// Author      : Nikhil Vinod Mallela
// Version     : 0.2 - modified E.Korsching
// Licence     : GNU GPLv3
// Description : This is to take a csfast and qual files and convert them to
//				 illumina fastq format with colorspace decoded to basespace
//				 following the 2-base encoding matrix. The sanger qualities
//				 are also converted to ASCII format after add 33 to adjust
//				 to illumina phred 33 scale.
//============================================================================

#include <iostream>
#include <fstream>
#include <string>
#include <vector>
#include <map>
#include <boost/algorithm/string/split.hpp>
#include <boost/algorithm/string.hpp>
#include <boost/algorithm/string/classification.hpp>
#include <boost/lexical_cast.hpp>

#include <boost/program_options.hpp>
#include <sstream>

using namespace std;
using namespace boost;
using namespace boost::program_options;


// global variables to hold the transformed quals and sequences
string baseseq;
string qualseq;
std::string csfasta;
std::string qual;

void cs2bs(string cstring);
void qual2phred33(string qstring);


bool cmdlargs(int argc, char* argv[])
{
    if (argc != 3) { // We expect 3 arguments: the program name, the input path1, the input path2
        std::cerr << "Usage: " << argv[0] << "  INPUT.color.fa  INPUT.qual.fa" << std::endl;
        return 1;	// return error
    }
    csfasta=argv[1];		// save in global var
    qual=argv[2];
    return 0;  // return ok
}


int main(int argc, char** argv) {

	bool result = cmdlargs(argc, argv);
	if (result)
	  return 1;

	string csfasta_line;
	string qual_line;

	ifstream input_csfasta(csfasta);
	ifstream input_qual(qual);

	int readheadcounter=0;

	if (input_csfasta.is_open() && input_qual.is_open()){
		while (getline (input_csfasta, csfasta_line) && getline (input_qual, qual_line)) {
			if (readheadcounter == 0) {
				if (csfasta_line.compare(qual_line) != 0){
					cerr << "\nERROR: csfasta readname " + csfasta_line + " is not equal to the qual readname " + qual_line + ". \n" +
					"Please check if both the csfasta file and qual file has equal number of reads with matching readnames." << endl;
					exit (EXIT_FAILURE);
				} else {
					// when readnames are equal, then print any readname, either from qual or csfasta.
					csfasta_line.erase(0,1);
					cout << "@"+csfasta_line << endl;
					// reset the readcounter to print the sequence and qualites on the next fetch.
					readheadcounter = 1;
				}
			} else {
				cs2bs(csfasta_line);
				cout << baseseq << endl;
				cout << "+" << endl;
				qual2phred33(qual_line);
				cout << qualseq << endl;
				// reset the readcounter to print the readname on the next fetch.
				readheadcounter = 0;
			}
		}

		input_csfasta.close();
		input_qual.close();
	}

	return 0;
}


void qual2phred33 (string qstring) {

	//string input= "31 31 30 31 31 28 31 31 30 31 31 31 31 31 29 30 28 31 31 23 28 28 31 30 31 31 31 31 31 30 31 31 31 31 30 31 31 31 31 31 31 31 31 31 30 31 32 30 31 19 30 28 31 28 28 21 28 13 31 31 31 17 31 30 17 31 26 31 26 17 30 21 27 21 29";
	//string input= "31 31 30 31 -1 28 31 31 30 31 31 31 31 31 29 30 28 31 31 23 28 28 31 30 31 31 31 31 31 30 31 31 31 31 30 31 31 31 31 31 31 31 31 31 30 31 32 30 31 19 30 28 31 28 28 21 28 13 31 31 31 17 31 30 17 31 26 31 26 17 30 21 27 21 29";
	string input = qstring;		// feeding the input from the function
	vector <std::string> quals;	// a vector that will hold the segments of the string after split
	vector <int> quals_int;		// a vector that will hold the quals in the 'integer' format after conversion.

	// splitting the input qual line
	boost::split(quals, input, boost::is_any_of(" "), boost::token_compress_on);

	// When the qualstring is 75, the quals.size is displaying 76 by appending a zero at the end.
	// I suspect if its due to the end of the line character. Therefore, quals.size()-1
	int quals_vec_size = (quals.size()-1);

	int int_conv; // a temporary variable that will hold the integer value

	for (int i=0; i < quals_vec_size; i++) {
		stringstream convert(quals[i]);
		convert >> int_conv;
		quals_int.push_back(int_conv);
	}

/* I have analyzed the distribution of the quality values in the .qual file.
 * No quality value is less than -1. -1 is the lowest quality. Beside wikipedia https://en.wikipedia.org/wiki/FASTQ_format#Color_space ,
 * bwa also adds 33 to the SOLiD quality and then convert to the corresponding ascii code.
 * http://www.tenouk.com/cpluscodesnippet/cplusasciicharintergerrep.html
 */
	quals_vec_size = quals_int.size();

	// NOTE: don't change i < quals_vec_size to <= ; this will add an extra -1 that isn't present in the vector element.

	string qualseq_append; // a local temporary variable that will hold the quality in phred33.

	for (int i=0; i < quals_vec_size; i++) {
		if (quals_int[i] == -1){
			quals_int[i] = 0;
		}

		qualseq_append = (qualseq_append+((char)(quals_int[i]+33)));
	}

	// copy the temporary variable contents to the global variable for the print in the main function.
	qualseq = qualseq_append;
}


void cs2bs (string cstring) {

	std::map<std::string, char> twobase;

	twobase["A0"] = 'A';
	twobase["A1"] = 'C';
	twobase["A2"] = 'G';
	twobase["A3"] = 'T';

	twobase["C1"] = 'A';
	twobase["C0"] = 'C';
	twobase["C3"] = 'G';
	twobase["C2"] = 'T';

	twobase["G2"] = 'A';
	twobase["G3"] = 'C';
	twobase["G0"] = 'G';
	twobase["G1"] = 'T';

	twobase["T3"] = 'A';
	twobase["T2"] = 'C';
	twobase["T1"] = 'G';
	twobase["T0"] = 'T';

	//string colorspace = "T303202210001123002120020110233120030023222011120021101131030321030003311220";
	//string colorspace = "T30320221000.123002120020110233120030023222011120021101131030321030003311220";

	string colorspace = cstring;
	string basespace = colorspace;
	int baselength = colorspace.length();
	using boost::lexical_cast;

	for (int i=1; i<=baselength; i++){
		if (colorspace[i] == '.'){
			for(int j=i; j<=baselength; j++) {
				basespace[j] = 'N';
			}
			break;
		}
		// colorbase = "T3"
		string colorbase = boost::lexical_cast<std::string>(basespace[(i-1)]) + boost::lexical_cast<std::string>(colorspace[i]);
		basespace[i]=(*twobase.find(colorbase)).second;
	}

	//cout << colorspace << endl;
	basespace.erase(0,1);

	// copy the decoded sequence content from loca variable to global variable.
	baseseq=basespace;
}
