\  *******************************************************************
\  *                                                                 *
\  *  Wil Baden  2003-02-14                                          *
\  *                                                                 *
\  *                  The Porter Stemming Algorithm                  *
\  *                                                                 *
\  *  "An Algorithm for Suffix Stripping", M.F.Porter, 1980          *
\  *                                                                 *
\  *  Translated from c_thread_safe.c version                        *
\  *								     *
\  *  The link to the Porter stemmer homepage is		     *
\  *     http://www/tartarus.org/~martin/PorterStemmer/              *
\  *                                                                 *
\  *******************************************************************

\ Required for kForth
include strings
include ans-words


\ TRUE 0= [IF]  \  Comment out what you already have.

     : THIRD   2 PICK ;
     : FOURTH  3 PICK ;
     : ANDIF   S" DUP IF DROP " EVALUATE ; IMMEDIATE
     : ORIF    S" DUP 0= IF DROP " EVALUATE ; IMMEDIATE
     : 3DROP   2DROP DROP ;
     : End-C@  1- CHARS + C@ ;

\ [THEN]

\  Cons-End is TRUE <=> last letter is a consonant.

: Cons-End     ( z i -- flag )
     CASE 2dup End-C@       ( z i c )
     [char] a OF  FALSE  ENDOF
     [char] e OF  FALSE  ENDOF
     [char] i OF  FALSE  ENDOF
     [char] o OF  FALSE  ENDOF
     [char] u OF  FALSE  ENDOF
     [char] y OF
         dup 1 = IF  TRUE  ELSE  2dup 1- RECURSE NOT  THEN
     ENDOF
         DROP TRUE
     0 ENDCASE
     NIP NIP ;

\  MEASURE measures the number of consonant sequences between 0 and j.
\  if c is a consonant sequence and v a vowel sequence, and <..>
\  indicates arbitrary presence,

\      <c><v>       gives 0
\      <c>vc<v>     gives 1
\      <c>vcvc<v>   gives 2
\      <c>vcvcvc<v> gives 3
\      ....

: MEASURE    ( z j -- m )
     0 0 2SWAP                     ( n i z j)
     BEGIN
         THIRD over > IF
             3DROP
         EXIT THEN
         over FOURTH Cons-End
     WHILE
         2>R  1+  2R>
     REPEAT
     2>R  1+  2R>
     BEGIN
         BEGIN
             THIRD over > IF
                 3DROP
             EXIT THEN
             over FOURTH Cons-End
         NOT WHILE
             2>R  1+  2R>
         REPEAT
         2>R  1+  >R  1+  R>  2R>
         BEGIN
             THIRD over > IF
                 3DROP
             EXIT THEN
             over FOURTH Cons-End
         WHILE
             2>R  1+  2R>
         REPEAT
         2>R  1+  2R>
     AGAIN ;

\  Vowel-in-Stem is TRUE <=> 0,...j contains a vowel

: Vowel-in-Stem    ( z n -- flag )
     0 ?DO
         dup I 1+ Cons-End NOT
             IF  DROP TRUE UNLOOP  EXIT THEN
     LOOP
     DROP FALSE ;

\  Doubled-End is TRUE <=> j,(j-1) contain a double consonant.

: Doubled-End     ( z i -- flag )
     dup 2 < IF  2DROP  FALSE  EXIT THEN
     2dup 2 - chars + dup C@ SWAP char+ C@ <>
         IF  2DROP  FALSE  EXIT  THEN
     Cons-End ;

\  CVC is TRUE <=> i-2,i-1,i has the form consonant - vowel -
\  consonant and also if the second c is not w ,x or y. this is used
\  when trying to restore an e at the end of a short word. e.g.

\     cav(e), lov(e), hop(e), crim(e), but
\     snow, box, tray.

: CVC        ( z i -- flag )
     dup 3 <           IF  2DROP  FALSE  EXIT THEN
     2dup Cons-End NOT     IF  2DROP  FALSE  EXIT THEN
     2dup 1- Cons-End      IF  2DROP  FALSE  EXIT THEN
     2dup 2 - Cons-End NOT IF  2DROP  FALSE  EXIT THEN
     1- chars + C@ dup [char] w = over [char] x = OR SWAP [char] y = OR
                       IF  FALSE  EXIT THEN
     TRUE ;

\  ENDS? is TRUE <=> 0,...k ends with the string s.

: ENDS?    ( z j ending i -- z j' flag )
     2over  dup FOURTH - /STRING  COMPARE 0= ;

\  setto(z, s) sets (j+1),...k to the characters in the string s,
\  readjusting k. */

: JOIN         ( z j s i -- z j+i )
     >R  THIRD THIRD chars +  R@ MOVE  R> + ;

\  STEP1AB gets rid of plurals and -ed or -ing. e.g.

\       caresses  ->  caress
\       ponies    ->  poni
\       ties      ->  ti
\       caress    ->  caress
\       cats      ->  cat

\       feed      ->  feed
\       agreed    ->  agree
\       disabled  ->  disable

\       matting   ->  mat
\       mating    ->  mate
\       meeting   ->  meet
\       milling   ->  mill
\       messing   ->  mess

\       meetings  ->  meet

: STEP1AB                  ( z j s i -- z i' )
     2dup END-C@ [char] s = IF
         S" sses" ENDS? IF  2 -  EXIT THEN
         S" ies" ENDS? IF  2 -  EXIT THEN
         2dup 1- END-C@ [char] s <> IF  1-  ELSE  EXIT THEN
     THEN

     S" eed" ENDS? IF
         2dup 3 - MEASURE 0> IF  1-  THEN
     EXIT THEN

     S" ed" ENDS? IF
     	2dup 2 - Vowel-in-Stem NOT IF  EXIT  THEN
     	2 -
     ELSE
     S" ing" ENDS? IF
     	2dup 3 - Vowel-in-Stem NOT IF  EXIT  THEN
     	3 -
     ELSE
     	EXIT
     THEN THEN

     S" at" ENDS?  ORIF S" bl" ENDS?  ORIF  S" iz" ENDS? THEN THEN
         IF  S" e" JOIN  EXIT THEN

     2dup Doubled-End IF
         1-
         2dup END-C@
         	dup [char] l = over [char] s = OR SWAP [char] z = OR
             	IF  1+  THEN
     EXIT THEN

     2dup MEASURE 1 = ANDIF
     2dup CVC         THEN
         IF  S" e" JOIN  THEN
     ;

\  STEP1C turns terminal y to i when there is another vowel in the stem.

: STEP1C   ( z j -- z j' )
     S" y" ENDS? IF
         2dup 1- Vowel-in-Stem
             IF  1- S" i" JOIN  THEN
     THEN ;

\  step2(z) maps double suffices to single ones. so -ization ( = -ize
\  plus -ation) maps to -ize etc. note that the string before the
\  suffix must give m(z) > 0.

: STEP2          ( z j -- z j-i )
     CASE     2dup 1- END-C@
     [char] a OF
         S" ational" ENDS? IF
             2dup  7 - MEASURE 0>
                 IF  7 -  S" ate" JOIN  THEN
         EXIT THEN
         S" tional" ENDS? IF
             2dup  6 - MEASURE 0>
                 IF  6 -  S" tion" JOIN  THEN
         EXIT THEN
     ENDOF
     [char] c OF
         S" enci" ENDS? IF
         	2dup  4 - MEASURE 0>
             	IF  4 -  S" ence" JOIN  THEN
         EXIT THEN
         S" anci" ENDS? IF
         	2dup  4 - MEASURE 0>
             	IF  4 -  S" ance" JOIN  THEN
         EXIT THEN
     ENDOF
     [char] e OF
         S" izer" ENDS? IF
         	2dup  4 - MEASURE 0>
             	IF  4 -  S" ize" JOIN  THEN
         EXIT THEN
     ENDOF
     [char] g OF
         S" logi" ENDS? IF
         	2dup  4 - MEASURE 0>
             	IF  4 -  S" log" JOIN  THEN
         EXIT THEN
     ENDOF
     [char] l OF
         S" bli" ENDS? IF
         	2dup  3 - MEASURE 0>
             	IF  3 -  S" ble" JOIN  THEN
         EXIT THEN
         S" alli" ENDS? IF
         	2dup  4 - MEASURE 0>
             	IF  4 -  S" al" JOIN  THEN
         EXIT THEN
         S" entli" ENDS? IF
         	2dup  5 - MEASURE 0>
             	IF  5 -  S" ent" JOIN  THEN
         EXIT THEN
         S" eli" ENDS? IF
         	2dup  3 - MEASURE 0>
             	IF  3 -  S" e" JOIN  THEN
         EXIT THEN
         S" ousli" ENDS? IF
         	2dup  5 - MEASURE 0>
             	IF  5 -  S" ous" JOIN  THEN
         EXIT THEN
     ENDOF
     [char] o OF
         S" ization" ENDS? IF
         	2dup  7 - MEASURE 0>
             	IF  7 -  S" ate" JOIN  THEN
         EXIT THEN
         S" ation" ENDS? IF
         	2dup  5 - MEASURE 0>
             	IF  5 -  S" ate" JOIN  THEN
         EXIT THEN
         S" ator" ENDS? ANDIF  2dup  4 - MEASURE 0>  THEN
             IF  4 -  S" ate" JOIN  EXIT THEN
     ENDOF
     [char] s OF
         S" alism" ENDS? IF
         	2dup  5 - MEASURE 0>
             	IF  5 -  S" al" JOIN  THEN
         EXIT THEN
         S" iveness" ENDS? IF
         	2dup  7 - MEASURE 0>
             	IF  7 -  S" ive" JOIN  THEN
         EXIT THEN
         S" fulness" ENDS? IF
         	2dup  7 - MEASURE 0>
             	IF  7 -  S" ful" JOIN  THEN
         EXIT THEN
         S" ousness" ENDS? IF
         	2dup  7 - MEASURE 0>
             	IF  7 -  S" ous" JOIN  THEN
         EXIT THEN
     ENDOF
     [char] t OF
         S" aliti" ENDS? IF
         	2dup  5 - MEASURE 0>
             	IF  5 -  S" al" JOIN  THEN
         EXIT THEN
         S" iviti" ENDS? IF
         	2dup  5 - MEASURE 0>
             	IF  5 -  S" ive" JOIN  THEN
         EXIT THEN
         S" biliti" ENDS? IF
         	2dup  6 - MEASURE 0>
             	IF  6 -  S" ble" JOIN  THEN
         EXIT THEN
     ENDOF
     ENDCASE ;

\  STEP3 deals with -ic-, -full, -ness etc. similar strategy to step2.

: STEP3
	CASE  2dup END-C@
	[char] e OF
         S" icate" ENDS? if
         	2dup  5 - MEASURE 0>
             	IF  5 -  S" ic" JOIN  THEN
         EXIT THEN
         S" ative" ENDS? IF
         	2dup  5 - MEASURE 0>
             	IF  5 -               THEN
         EXIT THEN
         S" alize" ENDS? IF
         	2dup  5 - MEASURE 0>
             	IF  5 -  S" al" JOIN  THEN
         EXIT THEN
	ENDOF
	[char] i OF
         S" iciti" ENDS? IF
         	2dup  5 - MEASURE 0>
             	IF  5 -  S" ic" JOIN  THEN
         EXIT THEN
	ENDOF
	[char] l OF
         S" ical" ENDS? IF
         	2dup  4 - MEASURE 0>
             	IF  4 -  S" ic" JOIN  THEN
         EXIT THEN
         S" ful" ENDS? IF
         	2dup  3 - MEASURE 0>
             	IF  3 -               THEN
         EXIT THEN
	ENDOF
	[char] s OF
         S" ness" ENDS? IF
         	2dup  4 - MEASURE 0>
             	IF  4 -               THEN
         EXIT THEN
	ENDOF
     ENDCASE ;

\  STEP4 takes off -ant, -ence etc., in context <c>vcvc<v>.

: STEP4
	CASE  2dup 1- end-c@
	[char] a OF
         S" al" ENDS? IF
         	2dup 2 - MEASURE 1 >
             	IF  2 -  THEN
         EXIT THEN
     ENDOF
	[char] c OF
         S" ance" ENDS?  IF
         	2dup 4 - MEASURE 1 >
             	IF  4 -  THEN
         EXIT THEN
         S" ence" ENDS?  IF
         	2dup 4 - MEASURE 1 >
             	IF  4 -  THEN
         EXIT THEN
     ENDOF
	[char] e OF
         S" er" ENDS?  IF
         	2dup 2 - MEASURE 1 >
             	IF  2 -  THEN
         EXIT THEN
     ENDOF
	[char] i OF
         S" ic" ENDS?  IF
         	2dup 2 - MEASURE 1 >
             	IF  2 -  THEN
         EXIT THEN
     ENDOF
	[char] l OF
         S" able" ENDS?  IF
         	2dup 4 - MEASURE 1 >
             	IF  4 -  THEN
         EXIT THEN
         S" ible" ENDS?  IF
         	2dup 4 - MEASURE 1 >
             	IF  4 -  THEN
         EXIT THEN
     ENDOF
	[char] n OF
         S" ant" ENDS?  IF
         	2dup 3 - MEASURE 1 >
             	IF  3 -  THEN
         EXIT THEN
         S" ement" ENDS?  IF
         	2dup 5 - MEASURE 1 >
             	IF  5 -  THEN
         EXIT THEN
         S" ment" ENDS?  IF
         	2dup 4 - MEASURE 1 >
             	IF  4 -  THEN
         EXIT THEN
         S" ent" ENDS?  IF
         	2dup 3 - MEASURE 1 >
             	IF  3 -  THEN
         EXIT THEN
     ENDOF
	[char] o OF
		S" tion" ENDS?  IF
			2dup 3 - MEASURE 1 >
             	IF  3 -  THEN
         EXIT THEN
		S" sion" ENDS?  IF
			2dup 3 - MEASURE 1 >
             	IF  3 -  THEN
         EXIT THEN
         S" ou" ENDS? 	IF
         	2dup 2 - MEASURE 1 >
             	IF  2 -  THEN
         EXIT THEN
     ENDOF
	[char] s OF
         S" ism" ENDS?  IF
         	2dup 3 - MEASURE 1 >
             	IF  3 -  THEN
         EXIT THEN
     ENDOF
	[char] t OF
         S" ate" ENDS?  IF
         	2dup 3 - MEASURE 1 >
             	IF  3 -  THEN
         EXIT THEN
         S" iti" ENDS?  IF
         	2dup 3 - MEASURE 1 >
             	IF  3 -  THEN
         EXIT THEN
     ENDOF
	[char] u OF
         S" ous" ENDS?  IF
         	2dup 3 - MEASURE 1 >
             	IF  3 -  THEN
         EXIT THEN
     ENDOF
	[char] v OF
         S" ive" ENDS?  IF
         	2dup 3 - MEASURE 1 >
             	IF  3 -  THEN
         EXIT THEN
     ENDOF
	[char] z OF
         S" ize" ENDS?  IF
         	2dup 3 - MEASURE 1 >
             	IF  3 -  THEN
         EXIT THEN
     ENDOF
     ENDCASE ;


\  STEP5 removes a final -e if m(z) > 1, and changes -ll to -l if
\  m(z) > 1.

: STEP5
     S" e" ENDS? IF
         2dup 1- MEASURE 1 > IF  1-
         ELSE
         2dup 1- MEASURE 1 =  ANDIF  2dup 1- CVC NOT THEN
             IF  1-  THEN
         THEN
     THEN
     S" ll" ENDS? IF
         2dup  1- MEASURE 1 >
             IF 1- THEN
     THEN ;


: STEM    ( z j -- z' j' )
     >R  PAD R@ MOVE  PAD R>
     dup 3 < IF  EXIT  THEN
     STEP1AB
     STEP1C
     STEP2
     STEP3
     STEP4
     STEP5
     ;

\ \   //   \\   //   \\   //   \\   //   \\   //   \\   //   \\   //   \\

