#! /usr/local/bin/perl -wc
#
#V  Porter.pm V2.1 21 Jun 1999 with '&$sub if defined' not 'eval ""'
#   Porter.pm V2.0 25 Nov 1994 (for Perl 5.000)
#   porter.pl V1.0 10 Aug 1994 (for Perl 4.036)
#   Jim Richardson, University of Sydney
#   jimr@maths.usyd.edu.au or http://www.maths.usyd.edu.au:8000/jimr.html

#   Find a canonical stem for a word, assumed to consist entirely of
#   lower-case letters.  The approach is from
#
#	M. F. Porter, An algorithm for suffix stripping, Program (Automated
#	Library and Information Systems) 14 (3) 130-7, July 1980.
#
#   This algorithm is used by WAIS: for example, see freeWAIS-0.3 at
#
#	http://kudzu.cnidr.org/cnidr_projects/cnidr_projects.html

#   Some additional rules are used here, mainly to allow for British spellings
#   like -ise.  They are marked ** in the code.

package Porter;

    #  Initialization required before using subroutine stem:

    #  We count syllables slightly differently from Porter: we say the syllable
    #  count increases on each occurrence in the word of an adjacent pair
    #
    #	[aeiouy][^aeiou]
    #
    #  This avoids any need to define vowels and consonants, or confusion over
    #  'y'.  It also works slightly better: our definition gives two syllables
    #  in 'yttrium', while Porter's gives only one because the initial 'y' is
    #  taken to be a consonant.  But it is not quite obvious: for example,
    #  consider 'mayfly' where, when working backwards (see below), the 'yf'
    #  matches the above pattern, even though it is the 'ay' which in Porter's
    #  terms increments the syllable count.
    #
    #  We wish to match the above in context, working backwards from the end of
    #  the word: the appropriate regular expression is
    
    $syl = '[aeiou]*[^aeiou][^aeiouy]*[aeiouy]';

    #  (This works because [^aeiouy] is a subset of [^aeiou].)  If we want two
    #  syllables ("m>1" in Porter's terminology) we can just match $syl$syl.

    #  For step 1b we need to be able to detect the presence of a vowel: here
    #  we revert to Porter's definition that a vowel is [aeiou], or y preceded
    #  by a consonant.  (If the . below is a vowel, then the . is the desired
    #  vowel; if the . is a consonant the y is the desired vowel.)

    $hasvow = '[^aeiouy]*([aeiou]|y.)';

    #  End of initialization

    require Exporter;
    @ISA = Exporter;
    @EXPORT = qw( stem );

sub stem
{
    my ( $word ) = @_;

    #  Reverse the word so we can easily apply pattern matching to the end:

    local( $_ );
    $_ = reverse $word;

    #  Step 1a: plurals -- sses->ss, ies->i, ss->ss, s->0

    m!^s! && ( s!^se(ss|i)!$1! || s!^s([^s])!$1! );

    #  Step 1b: participles -- SYLeed->SYLee, VOWed->VOW, VOWing->VOW;
    #  but ated->ate etc

    s!^dee($syl)!ee$1!o ||
    (
	s!^(de|gni)($hasvow)!$2!o &&
	(
	    #  at->ate, bl->ble, iz->ize, is->ise
	    s!^(ta|lb|[sz]i)!e$1! ||			# ** ise as well as ize
	    #  CC->C (C consonant other than l, s, z)
	    s!^([^aeioulsz])\1!$1! ||
	    #  (m=1) CVD->CVDe (C consonant, V vowel, D consonant not w, x, y)
	    s!^([^aeiouwxy][aeiouy][^aeiou]+)$!e$1!
	)
    );

    #  Step 1c: change y to i: happy->happi, sky->sky

    s!^y($hasvow)!i$1!o;

    #  Step 2: double and triple suffices (part 1)

    #  Switch on last three letters (fails harmlessly if subroutine undefined) --
    #  thanks to Ian Phillipps <ian@dial.pipex.com> who wrote
    #    CPAN authors/id/IANPX/Stem-0.1.tar.gz 
    #  for suggesting the replacement of
    #    eval( '&S2' . unpack( 'a3', $_ ) );
    #  (where the eval ignores undefined subroutines) by the much faster
    #    eval { &{ 'S2' . substr( $_, 0, 3 ) } };
    #  But the following is slightly faster still:

    my $sub;

    &$sub if defined &{ $sub = 'S2' . substr( $_, 0, 3 ) };

    #  Step 3: double and triple suffices, etc (part 2)

    &$sub if defined &{ $sub = 'S3' . substr( $_, 0, 3 ) };

    #  Step 4: single suffices on polysyllables

    &$sub if defined &{ $sub = 'S4' . substr( $_, 0, 2 ) };

    #  Step 5a: tidy up final e -- probate->probat, rate->rate; cease->ceas

    m!^e! && ( s!^e($syl$syl)!$1!o ||

	# Porter's ( m=1 and not *o ) E where o = cvd with d a consonant
	# not w, x or y:

	! m!^e[^aeiouwxy][aeiouy][^aeiou]! &&	# not *o E
	s!^e($syl[aeiouy]*[^aeiou]*)$!$1!o	# m=1
    );

    #  Step 5b: double l -- controll->control, roll->roll
    #  ** Note correction: Porter has m>1 here ($syl$syl), but it seems m>0
    #  ($syl) is wanted to strip an l off controll.

    s!^ll($syl)!l$1!o;

    scalar( reverse $_ );
}

sub S2lan
{
    #  SYLational -> SYLate,	SYLtional -> SYLtion
    s!^lanoita($syl)!eta$1!o || s!^lanoit($syl)!noit$1!o;
}

sub S2icn
{
    #  SYLanci -> SYLance, SYLency ->SYLence
    s!^icn([ae]$syl)!ecn$1!o;
}

sub S2res
{
    #  SYLiser -> SYLise **
    &S2rez;
}

sub S2rez
{
    #  SYLizer -> SYLize
    s!^re(.)i($syl)!e$1i$2!o;
}

sub S2ilb
{
    #  SYLabli -> SYLable, SYLibli -> SYLible ** (e.g. incredibli)
    s!^ilb([ai]$syl)!elb$1!o;
}

sub S2ill
{
    #  SYLalli -> SYLal
    s!^illa($syl)!la$1!o;
}

sub S2ilt
{
    #  SYLentli -> SYLent
    s!^iltne($syl)!tne$1!o
}

sub S2ile
{
    #  SYLeli -> SYLe
    s!^ile($syl)!e$1!o;
}

sub S2ils
{
    #  SYLousli -> SYLous
    s!^ilsuo($syl)!suo$1!o;
}

sub S2noi
{
    #  SYLization -> SYLize, SYLisation -> SYLise**, SYLation -> SYLate
    s!^noita([sz])i($syl)!e$1i$2!o || s!^noita($syl)!eta$1!o;
}

sub S2rot
{
    #  SYLator -> SYLate
    s!^rota($syl)!eta$1!o;
}

sub S2msi
{
    #  SYLalism -> SYLal
    s!^msila($syl)!la$1!o;
}

sub S2sse
{
    #  SYLiveness  -> SYLive, SYLfulness -> SYLful, SYLousness -> SYLous
    s!^ssen(evi|luf|suo)($syl)!$1$2!o;
}

sub S2iti
{
    #  SYLaliti -> SYLal, SYLiviti -> SYLive, SYLbiliti ->SYLble
    s!^iti(la|lib|vi)($syl)! ( $1 eq 'la' ? 'la' : $1 eq 'lib' ? 'elb' : 'evi' )
	. $2 !eo;
}

##

sub S3eta
{
    #  SYLicate -> SYLic
    s!^etaci($syl)!ci$1!o;
}

sub S3evi
{
    #  SYLative -> SYL
    s!^evita($syl)!$1!o;
}

sub S3ezi
{
    #  SYLalize -> SYLal
    s!^ezila($syl)!la$1!o;
}

sub S3esi
{
    #  SYLalise -> SYLal **
    s!^esila($syl)!la$1!o;
}

sub S3iti
{
    #  SYLiciti -> SYLic
    s!^itici($syl)!ci$1!o;
}

sub S3lac
{
    #  SYLical -> SYLic
    s!^laci($syl)!ci$1!o;
}

sub S3luf
{
    #  SYLful -> SYL
    s!^luf($syl)!$1!o;
}

sub S3sse
{
    #  SYLness -> SYL
    s!^ssen($syl)!$1!o;
}

##

sub S4la
{
    #  SYLSYLal -> SYLSYL
    s!^la($syl$syl)!$1!o;
}

sub S4ec
{
    #  SYLSYL[ae]nce -> SYLSYL
    s!^ecn[ae]($syl$syl)!$1!o;
}

sub S4re
{
    #  SYLSYLer -> SYLSYL
    s!^re($syl$syl)!$1!o;
}

sub S4ci
{
    #  SYLSYLic -> SYLSYL
    s!^ci($syl$syl)!$1!o;
}

sub S4el
{
    #  SYLSYL[ai]ble -> SYLSYL
    s!^elb[ai]($syl$syl)!$1!o;
}

sub S4tn
{
    #  SYLSYLant -> SYLSYL, SYLSYLe?ment -> SYLSYL, SYLSYLent -> SYLSYL
    s!^tn(a|e(me?)?)($syl$syl)!$3!o;
}

sub S4no
{
    #  SYLSYL[st]ion -> SYLSYL[st]
    s!^noi([st]$syl$syl)!$1!o;
}

sub S4uo
{
    #  SYLSYLou -> SYLSYL e.g. homologou -> homolog
    s!^uo($syl$syl)!$1!o;
}

sub S4ms
{
    #  SYLSYLism -> SYLSYL
    s!^msi($syl$syl)!$1!o;
}

sub S4et
{
    #  SYLSYLate -> SYLSYL
    s!^eta($syl$syl)!$1!o;
}

sub S4it
{
    #  SYLSYLiti -> SYLSYL
    s!^iti($syl$syl)!$1!o;
}

sub S4su
{
    #  SYLSYLous -> SYLSYL
    s!^suo($syl$syl)!$1!o;
}

sub S4ev
{
    #  SYLSYLive -> SYLSYL
    s!^evi($syl$syl)!$1!o;
}

sub S4ez
{
    #  SYLSYLize -> SYLSYL
    s!^ezi($syl$syl)!$1!o;
}

sub S4es
{
    #  SYLSYLise -> SYLSYL **
    s!^esi($syl$syl)!$1!o;
}

#!#

