#!/bin/sh
VERSION=2.10.1
LICENSE="Copyright (C) 2003-2007, 2009, 2011-2013 Dimitar Ivanov

License: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law."
################################################################################
#
# maphimbu - all purpose histogram builder for numerical and text data in 1-d
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
################################################################################
#
MYNAME=`basename $0`
DX=0
XAX=1
YAX=2
CX=0
SU=0
MV=0
PFORM="%d"
SORT=cat
SORT_OPT="-"
RC=0

### We need an AWK supporting assignments
exec 3>&2 2>&-
for a in gawk nawk awk
do
  [ "`echo |$a -v a=a '{}' 2>&1`" = "" ] && AWK=$a
done
exec 2>&3
[ "x$AWK" = x ] && \
  echo "Error: can't find 'awk' supporting assignments" && \
  exit 2

### Do sort correctly any data including numbers
#
_sort_data_on_demand_()
{
   if [ x$SORT_SEP = x ]; then
        $SORT $SORT_OPT
   else
            # Prepare the data in order to sort correctly floats, then do sort,
            # and finally remove the first column used for numerical sorting
        $AWK "{ printf(\"%.16f $SORT_SEP%s\n\", \$$1, \$0) }" \
        | $SORT $SORT_OPT \
        | cut -d$SORT_SEP -f2-
   fi
}

_acc_result_()
{
  [ $ACC_PROG ] || { cat ; return 0; }
  $ACC_PROG -S -y $ACC_COL
}

### Show usage
#
show_usage()
{
d=$2
sep=`echo |awk '{printf( "%080s", 0 )}' |tr 0 "$2"`

cat << !EOH
$sep
$1
$sep

Usage: $MYNAME [OPTION]... [FILE] 

Options:
   -x <int>       $d x-data column (default is $XAX)
   -y <int>       $d y-data column (default is $YAX);
                    It is considered only when combined with '-m' or '-S'
   -d <float|int> $d delta size (dx-resolution) in case of numerical data;
                    Keep in mind that 2 is not the same as 2.0: in the
                    first case the numbers are treated as integers, in
                    the latter as floats.
   -n             $d normalize the result
                    A) non-numerical data: n(x) = counts(x)/sum_of_counts
                    B) numerical data: the normalized probability density func.
                       is P(-INF < x <INF) = Int_[-INF:INF] P(x)*dx = 1 ;
                       For the numerical integration an open x-data interval
                       is assumed.
   -N             $d normalize data against the total sum of counts
                    (same as option '-n' with case A forced)
   -U             $d normalize the result with x-range mapped to unit;
                    The x-values are mapped to the interval [0:1],
                    respectively Int_[0:1] P(x)*dx = 1; This option is
                    mutually exclusive with '-n'.
   -S             $d compute sums and normalized sums of y-values;
                    The total sum of all y-values is used for the 
                    normalization.
   -m             $d compute the arithmetic average of the y-values
                    in the underlying x-bin
   -a             $d compute accumulated sums of the result data column
                    (default is colum 2, or 3 in case of normalization);
                    The program 'mintegrate' is need in this case.
   -s <int>       $d sort output using the specified column;
                    Header is skipped in this case.
   -g <int>       $d sort output numerically using the specified column;
                    Header is skipped in this case.
   -r             $d do reverse sorting (in combination with '-g' and '-s')
   -C             $d center histogram bins (shifts x-axis by 0.5*dx)
   -X             $d treat hole line as one string (single data record)
   -H             $d print a data description header
   -V             $d print program version and exit
   --version      $d output version and copyright information
   --help         $d display help
   -h             $d display short help (options summary)


Note: this program is not intended for use in high order numerical computations.


Report bugs to <gnu@mirendom.net>

!EOH
}

### Show version
#
show_version()
{
     cat << !ver
$MYNAME $VERSION
$LICENSE
!ver
}

[ "$1" = "--version" ] && show_version && exit
[ "$1" = "--help" ] && show_usage "This program produces 1-d histograms from numerical or text data input. It can be also used to estimate the probability distribution function of a numerical variable - see http://en.wikipedia.org/wiki/Probability_density_function." " " && exit

################################################################################
#
# MAIN
#
opts="aChHnNUmd:x:Xy:Vrs:g:S"
opts=`getopt $opts $*` || { opts="-h"; RC=1; }
set -- $opts

while [ $# -gt 0 ]
do
   case $1 in
        -h) show_usage "$MYNAME $VERSION: histogram builder for 1-d numerical and text data" "-" |egrep "^($MYNAME|Usage:|Options:|Note:|\ *-)"
            exit $RC
            ;;
        -d) DX=$2  
               # Try to find out the floating point separator
            FSEP="`echo $DX |tr -d '[0-9]'`"
            [ -n "$FSEP" ] && FLOAT="0${FSEP}5"
            shift 2
            ;;
        -n) NORM=1
            UN_HEAD="NORMALIZED_COUNTS"
            ACC_COL=3
            shift 1
            ;;
        -N) NORM=2
            UN_HEAD="NORMALIZED_COUNTS"
            ACC_COL=3
            shift 1
            ;;
        -U) NORM=3
            UN_HEAD="UNIT_X dx=%-.5g        UNIT_COUNTS"
            shift 1
            ;;
        -a) [ $ACC_COL ] || ACC_COL=2
            ACC_PROG=mintegrate
            which $ACC_PROG 2>&1 |grep -q "^\/" \
            || { 
                 echo "Error: can't find program '$ACC_PROG' in PATH"
                 exit 3
               }
            shift 1
            ;;
        -x) XAX=$2
            shift 2
            ;;
        -X) XAX=0
            shift 1
            ;;
        -y) YAX=$2
            shift 2
            ;;
        -m) MV=1
            MV_HEAD="   Y_MEAN_VALUE   Y_STD_DEV"
            shift 1
            ;;
        -S) SU=1
            MV_HEAD="   Y_SUM   NORMALIZED_Y_SUM"
            shift 1
            ;;
        -g) SORT=sort
            SORT_OPT="-nk1 $SORT_OPT"
            SORT_COL="$2"
            SORT_SEP=:
            shift 2
            ;;
        -s) SORT=sort
            SORT_OPT="-nk$2 $SORT_OPT"
            shift 2
            ;;
        -r) SORT=sort
            SORT_OPT="-r $SORT_OPT"
            shift 1
            ;;
        -C) CX=0.5
            shift 1
            ;;
        -H) HEAD=1
            shift 1
            ;;
        -V) echo $VERSION
            exit
            ;;
        --) shift 1
            break
            ;;
   esac
done

   # Remove header if sorting requested
[ $SORT != cat ] && { HEAD= ; UN_HEAD= ; MV_HEAD= ; }

### MAIN ###
#
cat $1 |$AWK -v dx=$DX -v x=$XAX -v y=$YAX -v cx=$CX -v su=$SU -v mv=$MV \
             -v head=$HEAD -v unhead="$UN_HEAD" -v mvhead="$MV_HEAD" \
             -v norm=$NORM -v float=$FLOAT \
'BEGIN \
{
    CONVFMT = "%.17g"
    OFMT    = "%.17g"
    if( dx != 0 ) { one_dx=1/dx }
    else          { one_dx=1      }
}
{
    bin=$x
    if( dx != 0 ) {
           # Round up for negative and positive numbers is different
        if( bin >= 0 ) half=+float
        else           half=-float
           # Round up and take the integer part
        bin=sprintf( "%d", (bin*one_dx)+half )
           # Multiplication by 1 is necessary to attain '-0' to be simply '0'
        bin=1*bin
    }
    hits[bin]++
    if( hits[bin] == 1 ) { nrh++; name[nrh]=bin }

       # Mean values or y-sums
    if( mv || su ) { ysum[bin]+=$y ; ysum2[bin]+=($y*$y) ; ysumT+=$y }

       # Check for min and max x-value if normalization chosen:
       # min and max are INTEGERS if dx!=0
    if( norm ) {
        if( ! counts )       { max=min=bin }
        if( bin > max )      { max=bin }
        else if( bin < min ) { min=bin }
    }

    counts++
}
END \
{
    if( !counts ) exit

        # On request normalize only against the number of counts - ignore dx!
    if( norm == 2 ) { one_dx=1 }
    one_sum=one_dx/counts

    if( mv || su ) { one_ysumT=one_dx/ysumT }

       # Calculate the width of the histogram (x-range)
    if( norm == 3 ) {
        if( dx == 0 ) { dxl=1  }
        else          { dxl=dx }
        x0=min*dxl
           # Found out the width of the x-range
        xrange=(max-min)*dxl
        if( xrange == 0 ) xrange=1
        one_xrange=1/xrange
           # Now rescale dxl by the x-data width
        dxl=dxl*one_xrange
           # Rescale also one_sum=1/dx/counts
        one_sum=one_sum*xrange
           # Evaluated the header to print out the rescaled dx
        unhead = sprintf( unhead, dxl );
    }
       # Print data header
    if( head ) {
        if( unhead != "" )
            printf("#%17s   %15s   %17s%24s\n", "X", "COUNTS", unhead, mvhead)
        else
            printf("#%17s   %15s   %24s\n", "X", "COUNTS", mvhead)
    }
       # Main data loop
    while( nrh > 0 ) {
           x_i=bin=name[nrh]
               # Shift x (possible only with numerical data)
           if( cx )      { x_i=sprintf( "%15.15g", bin+cx ) }
               # Normalize for dx!=0
           if( dx != 0 ) { x_val=sprintf( "%15.15g", x_i*dx ) }
           else          { x_val=x_i }
               # Print x-value and its count
           printf( "%18s   %15d", x_val, hits[bin] )
               # Print normalized x-value as mapped in the [0-1] range
           if( norm == 3 ) {
               printf( "   %15.15f", (x_val-x0)*one_xrange )
           }
               # Print an additional column with normalized histogram data
           if( norm ) {
               printf( "   %15.15f", hits[bin]*one_sum )
           }
               # Compute averages/sums and print additional columns
           if( mv || su ) {
               if( mv ) {
                      # Mean value of the y-variable: the used computation
                      # method can introduce large round-off errors
                   mean=ysum[bin]/hits[bin]
                   mean2=mean*mean
                   sigma2=ysum2[bin]/hits[bin]-mean2
                   sigma=sqrt(sigma2)
               } else {
                      # Normalized sum of y-values
                   mean=ysum[bin]
                   sigma=one_ysumT*mean
               }
               printf( "   %15.15g   %g", mean, sigma )
           }
           printf("\n")
           nrh--
    }
}' | _sort_data_on_demand_ $SORT_COL | _acc_result_

exit 0
