(use goo)

;; (use demos/mailboxhead/som/stemmer)

;; Goo port of the Porter stemming algorithm.  Explicitly ported from
;;  JSOmap's 'docorg' previous implementation.  As such it doesn't match up exactly
;;  with the algorithm proper [step2 is apparently the result of arbitrary splitting].
;; All attempts have been made to make this all functional [and slow].  I think I
;;  suceeded on both counts. :D

(dm consonant? (s|<str> i|<int> => <log>)
  (def c (elt s i))
  (case c
    ((#\a #\e #\i #\o #\u)
     #f)
    ((#\y)
     (if (= i 0)
         #t
         (not (consonant? s (- i 1))))
     )
    (#t
     #t)    
    )
  )

(dm measure-consonant-sequences (s|<str> => <int>)
  (def len-str (len s))
  (rep loop ((iChar 0) (con-spans 0) (inCons #t))
    (if (< iChar len-str)
      (if (consonant? s iChar)
          (if inCons
              (loop (+ iChar 1) con-spans #t) ;; c -> c
              (loop (+ iChar 1) (+ con-spans 1) #t)) ;; v -> c              
          (if inCons
              (loop (+ iChar 1) con-spans #f) ;; c -> v
              (loop (+ iChar 1) con-spans #f))) ;; v -> v
      con-spans
      )
    )
  )

(dm vowel-in-string? (s|<str> => <log>)
  (< (measure-consonant-sequences s) (len s))
  )

(df ends-with-double-consonant? (s|<str> => <log>)
  (def l (len s))
  (and (>= l 2)
       (= (elt s (- l 1)) (elt s (- l 2)))
       (consonant? s (- l 1))
       )
  )

(df ends-with-cvc? (s|<str> => <log>)
  (def l (len s))
  (and (>= l 3)
       (consonant? s (- l 1))
       (not (consonant? s (- l 2)))
       (consonant? s (- l 3))
       (case (elt s (- l 1))
         ((#\w #\x #\y)
          #f)
         (#t 
          #t)
         )
       )
    )

(df last-char (s|<str> => (t? <chr>))
  (def l (len s))
  (when (> l 0)
    (elt s (- l 1))
    )
  )
  
(df str-left-sub (s|<str> n|<int>)
  (if (< n 0)
      (sub s 0 (+ n (len s)))
      (sub s 0 n)
      )
  )

(df str-rev-elt (s|<str> n|<int>)
  (elt s (- (- (len s) n) 1))
  )

(dm step1 (s|<str>)
  (if (suffix? s "s")
      (seq
        (if (suffix? s "sses")
            (step1-b (str-left-sub s -2))
            (if (suffix? s "ies")
                (step1-b (str-left-sub s -2))
                (if (~= (str-rev-elt s 1) #\s)
                    (step1-b (str-left-sub s -1))
                    (step1-b s)
                  )
                )
            )
        )
      (step1-b s)
      )
  )

(dm step1-b (s|<str>)
  (if (suffix? s "eed")
      (if (> (measure-consonant-sequences (str-left-sub s -3)) 0)
          (str-left-sub s -1)
          s)
      (cond ((and (suffix? s "ed")
                  (vowel-in-string? (str-left-sub s -2)))
             (or (step1-c (str-left-sub s -2))
                 s)
             )
            ((and (suffix? s "ing")
                  (vowel-in-string? (str-left-sub s -3)))
             (or (step1-c (str-left-sub s -3))
                 s)
             )
            (#t
             s)
            )
      )
  )

(dm step1-c (s|<str>)
  (cond ((suffix? s "at")
         (cat s "e") ;; -> ate
         )
        ((suffix? s "bl")
         (cat s "e") ;; -> ble
         )
        ((suffix? s "iz")
         (cat s "e") ;; -> ize
         )
        ((ends-with-double-consonant? s)
         (case (last-char s)
           ((#\l #\s #\z)
            s ;; stay as you are
            )
           (#t
            (str-left-sub s -1) ;; get rid of the double.
            )
           )
         )
        ((and (= (measure-consonant-sequences s) 1)
              (ends-with-cvc? s))
         (cat s "e")
        )
        (#t
         s
         )
        )
  )

(dm step2 (s|<str>)
  (if (and (suffix? s "y")
           (vowel-in-string? (str-left-sub s -1)))
    (cat (str-left-sub s -1) "i")
    s
    )
  )

(df step3-transform (s|<str> suf|<str> rwith|<str>)
  (when (suffix? s suf)
    (def l (len s))
    (def root (sub s 0 (- l (len suf))))
    (when (> (measure-consonant-sequences root) 0)
      (cat root rwith)
      )
    )
  )

(dm step3 (s|<str>)
  (def l (len s))
  (if (>= l 2)
      (seq
        (case (str-rev-elt s 1)
          ;; a
          ((#\a)
           (or (step3-transform s "ational" "ate")
               (step3-transform s "tional" "tion")
               s
               )
           )
          ;; c
          ((#\c)
           (or (step3-transform s "enci" "ence")
               (step3-transform s "anci" "ance")
               s
               )
           )
          ;; e
          ((#\e)
           (or (step3-transform s "izer" "ize")
               s
               )
           )
          ;; l
          ((#\l)
           (or (step3-transform s "bli" "ble")
               (step3-transform s "alli" "al")
               (step3-transform s "entli" "ent")
               (step3-transform s "eli" "e")
               (step3-transform s "ousli" "ous")
               s
               )
           )
          ;; o
          ((#\o)
           (or (step3-transform s "ization" "ize")
               (step3-transform s "ation" "ate")
               (step3-transform s "ator" "ate")
               s
               )
           )
          ;; s
          ((#\s)
           (or (step3-transform s "alism" "al")
               (step3-transform s "iveness" "ive")
               (step3-transform s "fulness" "ful")
               (step3-transform s "ousness" "ous")
               s
               )
           )
          ;; t
          ((#\t)
           (or (step3-transform s "aliti" "al")
               (step3-transform s "iviti" "ive")
               (step3-transform s "biliti" "ble")
               s
               )
           )
          ;; g
          ((#\g)
           (or (step3-transform s "logi" "log")
               s
               )
           )
          (#t
           s)
          )
        )
      s
      )
  )

(dm step4 (s|<str>)
  (def l (len s))
  (case (last-char s)
    ;; e
    ((#\e)
     (or (step3-transform s "icate" "ic")
         (step3-transform s "ative" "")
         (step3-transform s "alize" "al")
         s
         )
     )
    ;; i
    ((#\i)
     (or (step3-transform s "iciti" "ic")
         s
         )
     )
     ;; l
    ((#\l)
     (or (step3-transform s "ical" "ic")
         (step3-transform s "ful" "")
         s
         )
     )
    ;; s
    ((#\s)
     (or (step3-transform s "ness" "")
         s
         )
     )
    (#t
     s
     )
    )
  )


(df step5-transform (s|<str> suf|<str>)
  (when (suffix? s suf)
    (def l (len s))
    (def root (sub s 0 (- l (len suf))))
    (when (> (measure-consonant-sequences root) 1)
      root
      )
    )
  )

(dm step5 (s|<str>)
  (def l (len s))
  (if (>= l 2)
      (seq
        (case (str-rev-elt s 1)
          ;; a
          ((#\a)
           (or (step5-transform s "al")
               s)
           )
          ;; c
          ((#\c)
           (or (step5-transform s "ance")
               (step5-transform s "ence")
               s)
           )
          ;; e
          ((#\e)
           (or (step5-transform s "er")
               s)
           )
          ;; i
          ((#\i)
           (or (step5-transform s "ic")
               s)
           )
          ;; l
          ((#\l)
           (or (step5-transform s "able")
               (step5-transform s "ible")
               s)
           )
          ;; n
          ((#\n)
           (or (step5-transform s "ant")
               (step5-transform s "ement")
               (step5-transform s "ment")
               (step5-transform s "ent")
               s)
           )
          ;; o
          ((#\o)
           (if (and (suffix? s "ion")
                    (> l 3)
                      (or (= (elt s (- l 4)) #\s)
                          (= (elt s (- l 4)) #\t)))
               (seq
                 (or (step5-transform s "ion")
                     s)
                 )
               (or (step5-transform s "ou")
                   s)
               )
             )
          ;; s
          ((#\s)
           (or (step5-transform s "ism")
               s)
           )
          ;; t
          ((#\t)
           (or (step5-transform s "ate")
               (step5-transform s "iti")
               s)
           )
          ;; u
          ((#\u)
           (or (step5-transform s "ous")
               s)
           )
          ;; v
          ((#\v)
           (or (step5-transform s "ive")
               s)
           )
          ;; z
          ((#\z)
           (or (step5-transform s "ize")
               s)
           )
          ;; ELSE
          (#t
           s)
          )
        )
      s
      )
  )

(dm step6 (s|<str>)
  (if (suffix? s "e")
      (seq
        (def m (measure-consonant-sequences s))
        (if (or (and (= m 1)
                     (not (ends-with-cvc? (str-left-sub s -1))))
                (> m 1))
            (step6-b (str-left-sub s -1))
            (step6-b s)
            )
        )
      (step6-b s)
      )
  )

(dm step6-b (s|<str>)
  (if (and (suffix? s "l")
           (ends-with-double-consonant? s)
           (> (measure-consonant-sequences s) 1))
      (str-left-sub s -1)
      s)
  )

(dm stem (word|<str>)
  (step6 (step5 (step4 (step3 (step2 (step1 word))))))
  )

(export
  stem
  step1
  step2
  step3
  step4
  step5
  step6
  )