;+
; NAME:
;    ks_twosample
;
; PURPOSE:
;    This function performs the two-sample Kolmogorov-Smirnov test.
;
; CATEGORY:
;    Statistics
;
; CALLING SEQUENCE:
;    result = ks_twosample( sample_data_1, sample_data_2 )
;
; INPUT:
;    SAMPLE_DATA_1:  A required float vector of length N_DATA_1 containing the 
;        data from the first sample.
;    SAMPLE_DATA_2:  A required float vector of length N_DATA_2 containing the 
;        data from the second sample.
;
; KEYWORD PARAMETERS:
;    DOUBLE:  If set, the calculations are performed in double precision.  The 
;        default is single precision.
;    METHOD:  A required scalar string specifying the method.  Supported values 
;        are:
;        * 'asymptotic smirnov':  The statistics are calculated using an 
;          asymptotic approximation of the Smirnov (1939) calculation valid for 
;          large sample sizes.
;        * 'exact':  The statistics are calculated by comparing the calculated 
;          K-S statistic against all possible pathways to that statistic given 
;          the sample sizes.
;        * 'exact weighted':  The statistics are calculated by comparing the 
;          calculated K-S statistic against all possible pathways to that 
;          statistic given the sample sizes, weighted such that all locations 
;          in the distribution have equal opportunity to produce the K-S 
;          statistic (instead of the usual bias toward the centre).
;
; OUTPUT:
;    RESULT:  A float scalar returning the signficance level from the 
;        Kolmogorov-Smirnoff test.
;
; USES:
;    threshold_quantile.pro
;
; PROCEDURE:
;    This function compares the distribution of the data in SAMPLE_DATA_2 with 
;    the data in SAMPLE_DATA_1.  Various methods are implemented, as described 
;    within the code.
;
; REFERENCES:
;    Wilcox, R. R.  2005.  Introduction to robust estimation and hypothesis 
;      testing.  Second Edition.  Elsevier Academic Press.  ISBN: 0-12-751542-9
;
; EXAMPLE:
;    ; Evaluate whether two vectors from the Gaussian random number generator 
;    ; are inconsistent with each other using the exact method.
;    data_1 = randomn( 1, 100 )
;    data_2 = randomn( 2, 100 )
;    print, ks_twosample( data_1, data_2, method='exact' )
;    ; Because the underlying distributions of the two samples are identical, 
;    ; there is a 10% chance that the result will be 0.10 or lower, etc.
;
; MODIFICATION HISTORY:
;    Written by:  Daithi A. Stone (dastone@runbox.com), 2025-03-07
;-

;***********************************************************************

FUNCTION KS_TWOSAMPLE, $
    SAMPLE_DATA_1, SAMPLE_DATA_2, $
    METHOD=method, $
    DOUBLE=double_opt, $
    TIES=ties_opt

;***********************************************************************
; Constants and options

; Confirm inputs
n_data_1 = n_elements( sample_data_1 )
if n_data_1 eq 0 then stop
n_data_2 = n_elements( sample_data_2 )
if n_data_2 eq 0 then stop

; Ensure a method is specified and supported
if n_elements( method ) ne 1 then stop
method = strlowcase( method )
temp = [ 'asymptotic smirnov', 'exact', 'exact weighted' ]
if max( method eq temp ) ne 1 then stop
; Restrict usage of the exact weighted distribution to n_data ratios within 3.
; Tests indicate that it performs very poorly with higher ratios.
if method eq 'exact weighted' then begin
  temp = float( n_data_1 ) / n_data_2
  if ( temp lt 1. / 3. ) or ( temp gt 3. ) then stop
endif

; Option for double precision
double_opt = keyword_set( double_opt )
if double_opt eq 1 then begin
  one = 1.d
endif else begin
  one = 1.
endelse

; Option to account for the possibility of ties
ties_opt = keyword_set( ties_opt )
if ( ties_opt eq 1 ) and ( method ne 'exact' ) then begin
  temp = "ks_twosample.pro:  " $
      + "Warning, TIES option only permitted when METHOD='exact'"
  print, temp
endif

;***********************************************************************
; Define cumulative distribution functions

; Calculate the empirical cumulative distribution function for the first data 
; sample
id_sort = sort( sample_data_1 )
cdf_1 = sample_data_1[id_sort]
if double_opt eq 1 then begin
  quant_1 = dindgen( n_data_1 ) / n_data_1
endif else begin
  quant_1 = findgen( n_data_1 ) / n_data_1
endelse
; Calculate the empirical cumulative distribution function for the second data 
; sample
id_sort = sort( sample_data_2 )
cdf_2 = sample_data_2[id_sort]
if double_opt eq 1 then begin
  quant_2 = dindgen( n_data_2 ) / n_data_2
endif else begin
  quant_2 = findgen( n_data_2 ) / n_data_2
endelse

; Interpolate the ECDF of the second sample to the quantiles of the first sample
if double_opt eq 1 then begin
  quant_2_to_1 = threshold_quantile( double( cdf_2 ), double( cdf_1 ), $
      presorted=1 )
endif else begin
  quant_2_to_1 = threshold_quantile( cdf_2, cdf_1, presorted=1 )
endelse
; Interpolate the ECDF of the first sample to the quantiles of the second sample
if double_opt eq 1 then begin
  quant_1_to_2 = threshold_quantile( double( cdf_1 ), double( cdf_2 ), $
      presorted=1 )
endif else begin
  quant_1_to_2 = threshold_quantile( cdf_1, cdf_2, presorted=1 )
endelse

; Pool the estimated CDFs and interpolated estimated CDFs from the two samples
quant_to_1 = [ quant_1, quant_1_to_2 ]
quant_to_2 = [ quant_2_to_1, quant_2 ]
; For dealing with ties we need an ordered version of cdf_12
if ties_opt eq 1 then begin
  cdf_12 = [ cdf_1, cdf_2 ]
  cdf_12_sorted = cdf_12[sort(cdf_12)]
endif


;***********************************************************************
; Perform the K-S test

; Calculate the K-S statistic
if double_opt eq 1 then begin
  ks_stat = max( abs( quant_to_1 - double( quant_to_2 ) ) )
endif else begin
  ks_stat = max( abs( quant_to_1 - quant_to_2 ) )
endelse

; If we are using the asymptotic Smirnov solution
if method eq 'asymptotic smirnov' then begin
  ; Calculate the p-value
  temp_stat = ks_stat $
      * sqrt( one * n_data_1 * n_data_2 / ( one * n_data_1 + n_data_2 ) )
  p_value = 2. * exp( -2. * ( temp_stat ^ 2. ) )
; For the exact method
endif else if method eq 'exact' then begin
  ; Calculate the total number of paths that could lead to a K-S statistic 
  ; value less than or equal to the one we have
  cij = fltarr( n_data_1 + 1, n_data_2 + 1 )
  for i_2 = 0, n_data_2 do begin
    temp_2 = float( i_2 ) / n_data_2
    for i_1 = 0, n_data_1 do begin
      cij[i_1,i_2] = abs( float( i_1 ) / n_data_1 - temp_2 ) le ks_stat
    endfor
  endfor
  if ties_opt eq 1 then begin
    for i_2 = 0, n_data_2 do begin
      for i_1 = 0, n_data_1 do begin
        if ( i_1 + i_2 ) lt ( n_data_1 + n_data_2 ) then begin
          if cdf_12[i_1+i_2] eq cdf_12[i_1+i_2+1] then cij[i_1,i_2] = 1.
        endif
      endfor
    endfor
  endif
  nij = dblarr( n_data_1 + 1, n_data_2 + 1 )
  nij[0,*] = cij[0,*]
  nij[*,0] = cij[*,0]
  for i_2 = 1, n_data_2 do begin
    for i_1 = 1, n_data_1 do begin
      nij[i_1,i_2] = cij[i_1,i_2] * ( nij[i_1,i_2-1] + nij[i_1-1,i_2] )
    endfor
  endfor
  ; Calculate the p-value
  ;p_value = factorial( n_data_1 ) * factorial( n_data_2 ) $
  ;    * nij[n_data_1+1,n_data_2+1] / factorial( n_data_1 + n_data_2 )
  ; Calculate the p-value using logarithms to allow large factorials
  log_p_value = total( alog10( dindgen( n_data_1 ) + 1. ) ) $
      + total( alog10( dindgen( n_data_2 ) + 1. ) ) $
      + alog10( nij[n_data_1,n_data_2] ) $
      - total( alog10( dindgen( n_data_1 + n_data_2 ) + 1. ) )
  p_value = 10. ^ log_p_Value
  if double_opt eq 0 then p_value = float( p_value )
  p_value = 1. - p_value
; For the exact weighted method
endif else if method eq 'exact weighted' then begin
  ; Calculate the weighted KS statistic (over-riding ks_stat)
  temp = quant_to_1 / ( 1. + one * n_data_2 / n_data_1 ) $
      + ( 1. - 1. / ( 1. + one * n_data_2 / n_data_1 ) ) * quant_to_2
  temp = temp * ( 1. - temp )
  id = where( temp gt 0 )
  temp = sqrt( one * n_data_1 * n_data_2 / ( n_data_1 + n_data_2 ) ) $
      * abs( quant_to_1[id] - quant_to_2[id] ) / sqrt( temp[id] )
  ks_stat = max( temp )
  temp = 0
  ; Calculate the total number of paths that could lead to a K-S statistic 
  ; value less than or equal to the one we have
  cij = fltarr( n_data_1 + 1, n_data_2 + 1 )
  for i_2 = 0, n_data_2 do begin
    for i_1 = 0, n_data_1 do begin
      temp = sqrt( one * n_data_1 * n_data_2 / ( n_data_1 + n_data_2 ) ) $
          * abs( float( i_1 ) / n_data_1 - float( i_2 ) / n_data_2 ) $
          / sqrt( ( one * i_1 + i_2 ) / ( n_data_1 + n_data_2 ) $
          * ( 1. - ( one * i_1 + i_2 ) / ( n_data_1 + n_data_2 ) ) )
      cij[i_1,i_2] = temp le ks_stat
    endfor
  endfor
  i_1 = 0.01  ; The limit as i_1->0
  i_2 = 0.01  ; The limit as i_2->0
  temp = sqrt( one * n_data_1 * n_data_2 / ( n_data_1 + n_data_2 ) ) $
       * abs( float( i_1 ) / n_data_1 - float( i_2 ) / n_data_2 ) $
       / sqrt( ( one * i_1 + i_2 ) / ( n_data_1 + n_data_2 ) $
       * ( 1. - ( one * i_1 + i_2 ) / ( n_data_1 + n_data_2 ) ) )
  cij[0,0] = temp le ks_stat
  i_1 = n_data_1 - 0.01  ; The limit as i_1->n_data_1
  i_2 = n_data_2 - 0.01  ; The limit as i_1->n_data_2
  temp = sqrt( one * n_data_1 * n_data_2 / ( n_data_1 + n_data_2 ) ) $
       * abs( float( i_1 ) / n_data_1 - float( i_2 ) / n_data_2 ) $
       / sqrt( ( one * i_1 + i_2 ) / ( n_data_1 + n_data_2 ) $
       * ( 1. - ( one * i_1 + i_2 ) / ( n_data_1 + n_data_2 ) ) )
  cij[n_data_1,n_data_2] = temp le ks_stat
  nij = dblarr( n_data_1 + 1, n_data_2 + 1 )
  nij[0,*] = cij[0,*]
  nij[*,0] = cij[*,0]
  for i_2 = 1, n_data_2 do begin
    for i_1 = 1, n_data_1 do begin
      nij[i_1,i_2] = cij[i_1,i_2] * ( nij[i_1,i_2-1] + nij[i_1-1,i_2] )
    endfor
  endfor
  ; Calculate the p-value
  ;p_value = factorial( n_data_1 ) * factorial( n_data_2 ) $
  ;    * nij[n_data_1+1,n_data_2+1] / factorial( n_data_1 + n_data_2 )
  ; Calculate the p-value using logarithms to allow large factorials
  log_p_value = total( alog10( dindgen( n_data_1 ) + 1. ) ) $
      + total( alog10( dindgen( n_data_2 ) + 1. ) ) $
      + alog10( nij[n_data_1,n_data_2] ) $
      - total( alog10( dindgen( n_data_1 + n_data_2 ) + 1. ) )
  p_value = 10. ^ log_p_Value
  if double_opt eq 0 then p_value = float( p_value )
  p_value = 1. - p_value
; Otherwise the requested method is not yet supported
endif else begin
  stop
endelse
;stop


;***********************************************************************
; The end

return, p_value
END
