#!/usr/bin/python3

import os
import sys
import subprocess
import argparse
import datetime
import time

# Example invocations:
# python3 CAZy_CDS_database.py -i CAZyDB.07312020.fa -o CAZyDB.07312020.CDS_nucleotide_format.fa
# python3 CAZy_CDS_database.py -i need_updated_ids.fa -o CAZyDB.07312020.CDS_nucleotide_format.fa


# Command-line interface. The arguments are parsed at module level, so
# `options` is available to the script body further down the file.
parser = argparse.ArgumentParser(
    description=(
        "From a file with GenBank protein ids this script creates a file "
        "with these proteins' CDS in nucleotide format using the NCBI API. "
        "The errors ocurring when using NCBI API will be stored in the log "
        "file this script creates.\n"
    )
)

# Fasta file whose headers carry the protein ids to look up.
parser.add_argument(
    "-i", "--input",
    dest="input",
    default=None,
    help="Input fasta file.\n",
)

# Fasta file that receives the nucleotide CDS records.
parser.add_argument(
    "-o", "--output_file",
    dest="output_file",
    default="",
    help="Output fasta file name.\n",
)

options = parser.parse_args()


def FASTA_iterator( fasta_filename ):
    """
    Yield (identifier, sequence) tuples from a fasta file.

    The identifier is the complete header line, including the leading ">".
    The sequence is the concatenation of every line between that header and
    the next one, with newline characters removed.

    Nothing is yielded for an empty file, and any sequence text that appears
    before the first header is ignored.

    NOTE: a header that is immediately followed by another header (no
    sequence lines in between) is not yielded; only the last record of the
    file is yielded even when its sequence is empty.
    """
    identifier = None   # header of the record being accumulated
    chunks = []         # sequence lines of the record being accumulated

    # The context manager closes the file once the generator is exhausted
    # or discarded.
    with open(fasta_filename, "r") as fd:
        for line in fd:
            line = line.strip("\n")
            if line.startswith(">"):
                seq = "".join(chunks)
                # Emit the previous record only if a header was seen for it
                # and it actually collected some sequence.
                if seq and identifier is not None:
                    yield (identifier, seq)
                chunks = []
                identifier = line
            else:
                chunks.append(line)

    # Flush the last record, provided the file contained at least one header.
    if identifier is not None:
        yield (identifier, "".join(chunks))

# Example of the kind of header line that get_protein_id() parses:
#>lcl|CP073567.2_cds_QUJ54615.1_1 [locus_tag=KEA48_00205] [protein=LysM peptidoglycan-binding domain-containing protein] [protein_id=QUJ54615.1] [location=49511..50443] [gbkey=CDS]

def get_protein_id(stn):
    """
    Extract the protein id from a header line.

    The header is split on whitespace and the first field containing the
    text "protein_id" is used: everything after its last "=" is returned
    with any surrounding "[" / "]" characters removed.

    Returns None when no field contains "protein_id".
    """
    matching_fields = (field for field in stn.split() if "protein_id" in field)
    tagged = next(matching_fields, None)
    if tagged is None:
        return None
    value = tagged.rsplit("=", 1)[-1]
    return value.strip("[]")

def _load_translated_ids(cds_fasta):
    """Return the set of protein ids found in the headers of *cds_fasta*.

    A missing or empty file yields an empty set, so the very first run
    (before any CDS record has been written) works.
    """
    if not os.path.isfile(cds_fasta) or os.path.getsize(cds_fasta) == 0:
        return set()
    return {get_protein_id(header) for header, _ in FASTA_iterator(cds_fasta)}


def _fetch_cds(protein_id, fasta_path, log_path):
    """Append the CDS records of *protein_id* to *fasta_path*.

    Runs the equivalent of
    ``esearch -db protein -query ID | efetch -format fasta_cds_na``
    without going through a shell, so characters in the id or in the file
    names can never be interpreted as shell syntax. efetch's stdout is
    appended to *fasta_path* and its stderr to *log_path*; a marker line
    with the id is written to the log first.

    Raises OSError (e.g. FileNotFoundError) if esearch or efetch cannot be
    started.
    """
    with open(log_path, "a") as log_handle, open(fasta_path, "a") as fasta_handle:
        log_handle.write("\n ---" + protein_id + "\n")
        # Flush so the marker lands in the log before efetch writes to it.
        log_handle.flush()

        esearch = subprocess.Popen(
            ["esearch", "-db", "protein", "-query", protein_id],
            stdout=subprocess.PIPE,
        )
        try:
            subprocess.call(
                ["efetch", "-format", "fasta_cds_na"],
                stdin=esearch.stdout,
                stdout=fasta_handle,
                stderr=log_handle,
            )
        finally:
            # Close our end of the pipe so esearch cannot block on a write
            # nobody will read, then reap it.
            esearch.stdout.close()
            esearch.wait()


if __name__=='__main__':
    fasta_file = options.input
    output_file = options.output_file

    # Fail with a usage message instead of an obscure open() error.
    if not fasta_file or not output_file:
        parser.error("both -i/--input and -o/--output_file are required")

    # Log file name carries a timestamp such as 2020-07-31_12:30:00.
    ct = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
    logfilename = "log_cazy_nucleotide_creation" + ct

    # Protein ids already present in the output file; they are not fetched
    # again, which lets an interrupted run be resumed.
    already_translated = _load_translated_ids(output_file)

    for identifier, seq in FASTA_iterator(fasta_file):
        # The id is the header text before the first "|", without the ">".
        identifier = identifier.lstrip(">").split("|")[0]

        if identifier in already_translated:
            continue

        print (identifier)
        if identifier == "sp": # a human sequence starting with sp| was breaking the script
            continue

        print(">>>>> "+identifier)
        _fetch_cds(identifier, output_file, logfilename)
        # Short pause between consecutive requests.
        time.sleep(0.11)