#! /usr/bin/perl -w
#In this version I combined similar methods into one aggregated score
###################################################################################
# Author      : Panos
# Date        : 04/27/2020
# Description : Predict the impact of variants using EA and other methods
# Usage       : MetaEA.pl location_of_directory Gene_ID
# USES        :
# USED BY     : 
###################################################################################
use strict;
use warnings;
use IO::Handle;    # makes STDOUT->autoflush available on perls older than 5.14
STDOUT->autoflush(1);

# Command line: <home directory> <gene identifier>.
# Diagnostics are printed to STDOUT; every failure below now exits with a
# non-zero status so that a calling shell/pipeline can detect it.
if(!$ARGV[1]){
    print "Please, provide the location of the input files and the gene identifier as:\n
    perl MetaEA_submit_to_NatComm.pl location_of_directory Gene_ID\n"; 
    exit 1;
};

# Home directory: must exist; a single trailing "/" is removed so the paths
# built from it below do not contain "//".
my $home_dir=$ARGV[0];
if (!-d $home_dir){print "No input directory was found at:\n$home_dir\n"; exit 1;};
$home_dir=~s{/\z}{};

#subroutines
# Helper scripts expected under <home>/code. All three must be present,
# although only $scoring is invoked by the visible part of this script.
my $pred_average="$home_dir/code/pred_average.pl"; 
my $pred_average_with_EA="$home_dir/code/pred_average_with_EA.pl";
my $scoring="$home_dir/code/score_file2cov_file.pl";
foreach my $helper ($pred_average, $pred_average_with_EA, $scoring) {
    next if (-e $helper);
    (my $helper_name=$helper)=~s{.*/}{};    # file name without its directory
    print "$helper_name file is missing at:\n$home_dir/code\n"; exit 1;
}


#input
# Per-gene inputs live in <home>/<gene>; the EA file <gene>.EA is mandatory.
my $gene=$ARGV[1]; 
my $mpred_location="$home_dir/$gene";
if (!-d $mpred_location){print "No methods directory was found at:\n$mpred_location\n"; exit 1;};
my $new_file=$gene.".MetaEA";    # base name of the output, created in the current directory
my $ea_file="$mpred_location/$gene.EA";
if (!-e $ea_file){print "No EA file was found for $gene\n$ea_file\n"; exit 1;};
my %predictor;    my $m;
#________________________________________________________________________________________________________________________
# 1. Register the available predictors
#________________________________________________________________________________________________________________________

# Short predictor name => suffix of its prediction file. The file that is
# looked up later is "<methods dir>/<gene>.<suffix>".
%predictor = (
    "MutPred"          => "MutPred_dbNSFP4.0a",
    "MutationAssessor" => "MutationAssessor_dbNSFP4.0a",
    "SIFT4G"           => "SIFT4G_dbNSFP4.0a",
    "SIFT"             => "SIFT_dbNSFP4.0a",
    "SIFTsame"         => "SIFT-same_dbNSFP2.9.1",
    "PROVEAN"          => "PROVEAN_dbNSFP4.0a",
    "DEOGEN2"          => "DEOGEN2_dbNSFP4.0a",
    "VEST3"            => "VEST3_dbNSFP3.5a",
    "VEST4"            => "VEST4_dbNSFP4.0a",
    "REVEL"            => "REVEL_dbNSFP4.0a",
    "Polyphen2_HDIV"   => "Polyphen2_HDIV_dbNSFP4.0a",
    "Polyphen2_HVAR"   => "Polyphen2_HVAR_dbNSFP4.0a",
    "Eigen"            => "Eigen-raw_dbNSFP4.0a",
    "EigenPC"          => "Eigen-PC-raw_dbNSFP4.0a",
    "CADD"             => "CADD_raw_dbNSFP4.0a",
    "CADD13"           => "CADD_raw-v1.3_dbNSFP3.5a",
    "CADD12"           => "CADD_raw-v1.2_dbNSFP2.9.1",
    "MPC"              => "MPC_dbNSFP4.0a",
    "MetaLR"           => "MetaLR_dbNSFP4.0a",
    "MetaSVM"          => "MetaSVM_dbNSFP4.0a",
    "PrimateAI"        => "PrimateAI_dbNSFP4.0a",
    "MVP"              => "MVP_dbNSFP4.0a",
    "MutationTaster1"  => "MutationTaster-v1_dbNSFP2.9.1",
    "MutationTaster"   => "MutationTaster_dbNSFP4.0a",
    "MCAP1"            => "M-CAP-v1.0_dbNSFP3.5a",
    "MCAP"             => "M-CAP_dbNSFP4.0a",
    "LRT"              => "LRT_dbNSFP4.0a",
    "FATHMM"           => "FATHMM_dbNSFP4.0a",
    "DANN"             => "DANN_dbNSFP4.0a",
    "FATHMM_MKL"       => "fathmm-MKL_dbNSFP4.0a",
    "FATHMM_XF"        => "fathmm-XF_dbNSFP4.0a",
);
#
#________________________________________________________________________________________________________________________
# 2. Remove unavailable predictors and find the pcc for the remaining ones (accounting for similarities)
#________________________________________________________________________________________________________________________
## 2.1 Read the pred file of EA
# EA predictions keyed by variant. Each non-blank line is tab-separated: the
# first field is the variant, the last field is the EA score, stored here
# divided by 100 (presumably EA is given on a 0-100 scale -- TODO confirm).
my %EApred;
open (my $ea_fh, "<", $ea_file) or die ("Could not open the EA file $ea_file: $!");
while(my $ea_line=<$ea_fh>){
    next if ($ea_line!~/\w/);    # skip blank lines
    chomp $ea_line;
    my @a=split(/\t/,$ea_line);
    $EApred{$a[0]}=$a[-1]/100;
}
close $ea_fh;
# Replace every score by its coverage (cumulative rank fraction, see cov).
%EApred=cov(\%EApred);
#
## 2.2 Read the pred files of all remaining predictors and calculate pcc to ea
# %pred : raw score of every predictor, keyed by "<predictor>;<variant>".
# %pcc  : correlation of each predictor (after coverage transform) to EA.
# Predictors whose file is missing, or whose file has a score on fewer than
# half of its non-blank lines, are removed from %predictor.
my %pred; my %pcc;
foreach my $m ( keys %predictor) {
    my $file="$mpred_location/$gene.$predictor{$m}";
    if (!-e $file){delete $predictor{$m}; next; };
    my $lines=0; my $data=0; my %mpred;
    open (my $pred_fh, "<", $file) or die ("Could not open the file $file: $!");
    while(my $pred_line=<$pred_fh>){
        next if ($pred_line!~/\w/);    # skip blank lines
        chomp $pred_line; $lines++;
        my @a=split(/\t/,$pred_line);
        next if($a[-1]!~/\d/);         # last field carries no number: no prediction
        $data++;
        $pred{"$m;$a[0]"}=$a[-1];
        $mpred{$a[0]}=$a[-1];
    }
    close $pred_fh;
    # Too sparse to be useful: no scores at all, or scores on under 50% of the lines.
    if (!$data or $data<0.5*$lines){delete $predictor{$m}; next; };
    %mpred=cov(\%mpred);
    # May be the label "Few_or_No_data" instead of a number (see pcc).
    $pcc{$m}=pcc(\%EApred, \%mpred);
}


## 2.3 Greedily combine predictors while the PCC of the combined score to EA keeps improving
# Greedy forward selection of predictors.
#   %integral_pred : current combined ("integral") prediction per variant,
#                    "-" where no selected predictor has a score.
#   %weight        : accumulated weight of each selected predictor.
#   $best_pcc      : PCC to EA of the best combination accepted so far.
# pcc() may return the label "Few_or_No_data" instead of a number. Such a
# result can never beat a non-negative running maximum, so it is skipped
# explicitly instead of being compared numerically (which only produced
# "isn't numeric" warnings).
my $best_pcc=0; my %integral_pred; my %weight;
### First round
#### Add the top method: the single predictor with the highest positive PCC to EA
my $run_top_pcc=0; my $run_top_m="";
foreach my $m ( keys %predictor) {
    next if ($pcc{$m}!~/^-?\d/);
    if($run_top_pcc<$pcc{$m}){$run_top_pcc=$pcc{$m}; $run_top_m=$m; };
}
#### Get the top method's scores
foreach my $v ( keys %EApred) {
    my $key="$run_top_m;$v";
    my $pr="-"; if(defined $pred{$key}){ if($pred{$key}=~/\d/){$pr=$pred{$key}} };
    $integral_pred{$v}=$pr;
}
%integral_pred=cov(\%integral_pred);
# Setting $run_top_pcc to 1 only forces the while loop below to start.
$best_pcc=$run_top_pcc; $run_top_pcc=1;
# Record a weight only when a predictor was actually chosen; otherwise an
# empty method name would later be listed in the output header.
if($run_top_m ne ""){$weight{$run_top_m}=1};

# loop to add more methods to increase PCC
# Each pass tries to blend every predictor (with weight $factor) into the
# current integral prediction and keeps the one giving the highest PCC, if it
# beats $best_pcc. When no predictor helps, $factor is lowered by 0.25 and the
# search is repeated; the loop stops once $factor drops below 0.1.
# NOTE(review): the candidate PCC is computed on coverage-transformed values
# of only the variants the candidate scores, whereas the accepted update is
# applied to %integral_pred without a new coverage transform -- confirm intended.
my $factor=1;

while($run_top_pcc>$best_pcc){
    $run_top_pcc=0;$run_top_m="";
    # Total weight already accumulated in the integral prediction.
    my $integrals=0; foreach my $m ( keys %predictor) { if(!$weight{$m}){$weight{$m}=0}; $integrals=$integrals+$weight{$m}};
    foreach my $m ( keys %predictor) {
        my %test_integral_pred;
        foreach my $v ( keys %EApred) {
            my $key="$m;$v";
            next if (!defined $pred{$key});
            next if ($pred{$key}!~/\d/);
            if($integral_pred{$v}=~/\d/){
                # weighted mean of the current integral and the candidate's score
                $test_integral_pred{$v}=($integrals*$integral_pred{$v}+$factor*$pred{$key})/($integrals+$factor);
            }else{
                # nothing accumulated for this variant yet: take the candidate's score
                $test_integral_pred{$v}=$pred{$key};
            }
        }
        %test_integral_pred=cov(\%test_integral_pred);
        my $pcc_run=pcc(\%EApred, \%test_integral_pred);
        next if ($pcc_run!~/^-?\d/);
        if($run_top_pcc<$pcc_run){$run_top_pcc=$pcc_run; $run_top_m=$m; };
    }
    print "$factor*$run_top_m is $run_top_pcc vs $best_pcc\n";
    if($run_top_pcc>$best_pcc){
        # Accept the best candidate: fold its scores into the integral prediction.
        foreach my $v ( keys %EApred) {
            my $key="$run_top_m;$v";
            next if (!defined $pred{$key});
            next if ($pred{$key}!~/\d/);
            if($integral_pred{$v}=~/\d/){
                $integral_pred{$v}=($integrals*$integral_pred{$v}+$factor*$pred{$key})/($integrals+$factor);
            }else{
                $integral_pred{$v}=$pred{$key};
            }
        }
        $best_pcc=$run_top_pcc; $run_top_pcc=1;
        if(!$weight{$run_top_m}){$weight{$run_top_m}=0}; $weight{$run_top_m}=$weight{$run_top_m}+$factor;
    }else{
        # No improvement at this weight: retry with a smaller one, or stop.
        $factor=$factor-0.25;
        if($factor<0.1){$run_top_pcc=0}else{$run_top_pcc=1};
    }
}
    
# Human-readable list of the contributing methods, heaviest first, written as
# " <weight>x<method> +" items; EA always takes part with weight 1. The final
# character (the last "+") is removed.
$weight{"EA"}=1;
my $methods_used=join("",
    map  { " ".(1*$weight{$_})."x$_ +" }
    grep { $weight{$_} }
    sort { $weight{$b} <=> $weight{$a} } keys %weight);
chop $methods_used;

# Total weight of the selected predictors (unselected ones are given weight 0).
my $integrals=0;
foreach my $m ( keys %predictor) {
    $weight{$m}||=0;
    $integrals+=$weight{$m};
}

# Final score where a combined prediction exists: weighted mean of EA
# (weight 1) and the integral prediction (weight $integrals).
my %score;
foreach my $v (keys %EApred) {
    my $others=$integral_pred{$v};
    next unless (defined $others);
    next unless ($others=~/\d/);
    $score{$v}=($EApred{$v}+$integrals*$integral_pred{$v})/(1+$integrals);
}

# Per-residue statistics over the variants that did get a score:
# count (%fresi_n), sum of EA (%fresi_e) and sum of final scores (%fresi_b).
# The residue is the variant string without its first and last character.
my %fresi_n; my %fresi_e; my %fresi_b;
foreach my $v (keys %EApred) {
    my $resi=substr($v,1,-1);
    unless($fresi_n{$resi}){
        $fresi_n{$resi}=0; $fresi_b{$resi}=0; $fresi_e{$resi}=0;
    }
    next unless (exists $score{$v});
    $fresi_n{$resi}++;
    $fresi_e{$resi}=$fresi_e{$resi}+$EApred{$v};
    $fresi_b{$resi}=$fresi_b{$resi}+$score{$v};
}

# Variants without a score from the other predictors: shift EA by the mean
# (score - EA) difference seen at the same residue, or fall back to plain EA
# when no variant of that residue was scored.
foreach my $v (keys %EApred) {
    next if (exists $score{$v});
    my $resi=substr($v,1,-1);
    if (!$fresi_n{$resi}){
        $score{$v}=sprintf "%.3f", $EApred{$v};
        next;
    }
    my $av= $fresi_b{$resi}/$fresi_n{$resi} - $fresi_e{$resi}/$fresi_n{$resi};
    $score{$v}=$EApred{$v}+$av;
}

# Write "<gene>.MetaEA.scores": one line per variant with EA, the combined
# prediction of the other methods and the final score, all multiplied by 100.
# Variants are ordered by residue, then by the last character of the variant;
# a blank line separates residues.
# NOTE(review): the header names a fifth column (Coverage) that is not written
# here; presumably the scoring helper adds it -- confirm.
my $scores_file="$new_file.scores";
my $cov_file="$new_file.cov";
open (my $scores_fh, ">", $scores_file) or die ("Could not open file $scores_file for writing: $!");
print $scores_fh "# Average of:$methods_used\n#Variant\tEA\tOthers_av\tScore\tCoverage\n";
my $res=0;
foreach my $var (sort { substr($a,1,-1)<=>substr($b,1,-1) || substr($a,-1) cmp substr($b,-1) } keys %EApred) {
    my $resi=substr($var,1,-1); 
    if($res!=$resi){print $scores_fh "\n"; $res=$resi};
    # Only a missing/empty value becomes "."; a numeric 0 is a real prediction
    # (it already went into the score) and must be printed as such.
    if(!defined $integral_pred{$var} or $integral_pred{$var} eq ""){$integral_pred{$var}="."};
    $EApred{$var}=sprintf "%.2f", 100*$EApred{$var};
    if($integral_pred{$var}=~/\d/){$integral_pred{$var}=sprintf "%.2f", 100*$integral_pred{$var}};
    $score{$var}=sprintf "%.2f", 100*$score{$var};
    print $scores_fh "$var\t$EApred{$var}\t$integral_pred{$var}\t$score{$var}\n";
}
close $scores_fh or die ("Could not finish writing $scores_file: $!");

# Run the scoring helper on column 4 of the scores file. The list form of the
# pipe open bypasses the shell, so paths with spaces or shell metacharacters
# are passed intact; the helper's standard output is read and discarded, as
# the former backtick call did.
open (my $scoring_out, "-|", $scoring, $scores_file, 4) or die ("Could not run $scoring: $!");
while(defined(my $discarded=<$scoring_out>)){ }
close $scoring_out;
if($? != 0){warn "$scoring exited with status $?\n"};

# The helper is expected to leave its result in "<gene>.MetaEA.cov". Only
# delete the intermediate scores once that file exists, then move it to the
# final name.
if(!-e $cov_file){die ("$scoring did not produce $cov_file; $scores_file was kept\n")};
unlink($scores_file) or warn "Could not remove $scores_file: $!\n";
rename($cov_file, $new_file) or die ("Could not rename $cov_file to $new_file: $!");

##
#

exit;

# pcc(\%e, \%p)
# Pearson correlation coefficient between two score hashes keyed by variant.
# Only keys of %e for which both hashes hold a value containing a digit are used.
# NOTE: the callers in this file pass coverage-transformed values (see cov).
# Returns:
#   - the label "Few_or_No_data" when there is no usable pair, when fewer than
#     10% of the keys of %e are usable, or when either series has no variance;
#   - otherwise (r + 0.000005) cut to five decimals, as a fixed-point string
#     such as "0.12345" or "-0.12345".
sub pcc {
    my %e = %{shift()};
    my %p = %{shift()};

    # Collect the paired values and their sums in one pass.
    my @x; my @y; my $allv=0; my $sx=0; my $sy=0;
    foreach my $v ( keys %e) {
        $allv++;
        next if (!defined $p{$v});
        next if ($p{$v}!~/\d/ or $e{$v}!~/\d/);
        push(@x,$e{$v}); push(@y,$p{$v});
        $sx=$sx+$e{$v};
        $sy=$sy+$p{$v};
    }
    my $cnt=scalar(@x);
    # !$cnt also covers an empty %e, which used to reach a division by zero.
    if(!$cnt or $cnt<0.1*$allv){ return "Few_or_No_data" };

    my $ax=$sx/$cnt;
    my $ay=$sy/$cnt;
    my $sprod=0; my $sx2=0; my $sy2=0;
    foreach my $i (0..$#x) {
        my $dx=$x[$i]-$ax; my $dy=$y[$i]-$ay;
        $sprod=$sprod+$dx*$dy;
        $sx2=$sx2+$dx*$dx;
        $sy2=$sy2+$dy*$dy;
    }
    if(!$sx2 or !$sy2){ return "Few_or_No_data"};
    my $norm=sqrt($sx2*$sy2);
    if(!$norm){ return "Few_or_No_data"};    # product underflowed to zero

    # Format in fixed-point notation before cutting to five decimals. Cutting
    # the default stringification is wrong for |r| < 1e-4, where Perl prints
    # exponent notation (e.g. "7.2e-05" was cut to a bogus 7.2).
    my $c=sprintf("%.15f", $sprod/$norm+0.000005);
    $c=~s/(\.\d{5})\d+\z/$1/;
    return $c;
}

# cov(\%e)
# Coverage transform: every value that contains a digit is replaced by the
# fraction of such values that are less than or equal to it. The percentage
# is cut (not rounded) to two decimals before being divided by 100. Values
# without a digit are returned unchanged. Returns the transformed hash as a list.
sub cov {
    my %e = %{shift()};

    # Count how often each distinct score occurs, remembering first-seen order.
    my %numb; my @r; my $cnt=0;
    foreach my $c (values %e) {
        next if($c!~/\d/);
        push(@r,$c) unless ($numb{$c}++);
        $cnt++;
    }

    # Walk the distinct scores in ascending order, accumulating the coverage.
    my %covofrank; my $cur=0;
    foreach my $value (sort { $a <=> $b } @r){
        $cur+=$numb{$value};
        my $cover=100*$cur/$cnt;
        if($cover>100){
            # sanity check: the cumulative coverage can never exceed 100%
            print "$cover for $value\n$cur / $cnt\n\n";
            exit;
        }
        # keep at most two characters after the decimal point (string cut)
        if($cover=~/^([^.]*\..{2})./s){ $cover=$1 };
        $covofrank{$value}=$cover;
    }

    # Map every score to its coverage fraction.
    foreach my $v (keys %e) {
        my $rank=$covofrank{$e{$v}};
        $e{$v}=$rank/100 if ($rank);
    }
    return %e;
}
