#!/usr/bin/perl
## ccluster.pl --- 12.6.2004

# Copyright (C) 2004 Virach Sornlertlamvanich

## Author: Virach SORNLERTLAMVANICH
## Keywords: character clustering tool, Thai text

# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 2, or (at
# your option) any later version.

# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.

## Commentary: To cluster Thai text into undividable units.  A character
# cluster is defined to be the smallest recognizable unit.  The character
# string is clustered so that later processing never has to deal with
# invalid Thai character units.

## Usage: ccluster.pl textfile
#
# Input is read from the named files (or standard input) and is expected
# to be UTF-8.  Every whitespace-separated token of a line is cut into
# clusters; each cluster is written followed by "|", and tokens are
# separated by " |".  Lines without any token produce no output.
#
# History: this script started life as the awk program "sylsep.mwk" (Thai
# syllable separator) and was converted with a2p.  The a2p scaffolding that
# relied on "$[ = 1" and on "$*" has been removed because both are fatal
# errors on current perls; all indexing below is 0-based.

## Code:

use strict;
use warnings;
use utf8;                       # Thai literals below are UTF-8 source text
use Encode qw(decode);

# ----------------------------------------------------------------------
# Character classes (regex fragments).  Combining marks are written as
# \x{...} escapes so they stay legible in editors; spacing letters are
# written literally.
# ----------------------------------------------------------------------

# Thai consonants (initial position).
my $consonant = '[กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรลวศษสหฬอฮ]';

# Consonants that may close a syllable (final consonant / speller).
my $final = '[กขคฆงจชซญฎฏฐฑฒณดตถทธนบปพฟภมยรลวศษสฬอ]';

# Consonants that may lead a mixed cluster, and the second member.
my $cluster_lead = '[กขคจฉซตถทบปผพฟศส]';
my $ra_la_wa     = '[รลว]';

# Consonants that may follow a leading "ห".
my $after_ho = '[งญนมยรลว]';

# Final characters of the basic spelling groups (กก กด กบ กง กน กม) + อ.
my $basic_final = '[กดบงนมอ]';

# Thai vowel characters: ฤ ฦ, U+0E30..U+0E39 (ะ ั า ำ and the upper/lower
# vowel marks), U+0E40..U+0E44 (เ แ โ ใ ไ).
my $vowel = '[ฤฦ\x{0E30}-\x{0E39}\x{0E40}-\x{0E44}]';

# Tone marks: mai ek, mai tho, mai tri, mai chattawa.
my $tone = '[\x{0E48}-\x{0E4B}]';

# Individual marks and small mark sets used by the rules.
my $mai_han_akat      = '\x{0E31}';
my $sara_i            = '\x{0E34}';
my $sara_ii           = '\x{0E35}';
my $sara_uee          = '\x{0E37}';
my $maitaikhu         = '\x{0E47}';
my $thanthakhat       = '\x{0E4C}';
my $sara_a_or_am      = '[\x{0E30}\x{0E33}]';                   # ะ ำ
my $upper_lower_vowel = '[\x{0E34}-\x{0E39}]';                  # i ii ue uee u uu
my $sara_ue_uee       = '[\x{0E36}\x{0E37}]';
my $sara_i_ii_u_uu    = '[\x{0E34}\x{0E35}\x{0E38}\x{0E39}]';
my $sara_u_uu         = '[\x{0E38}\x{0E39}]';

# Characters that never start a word: ฯ, U+0E30..U+0E3A (ะ ั า ำ, the
# upper/lower vowels and phinthu) and U+0E46..U+0E4C (ๆ, maitaikhu, the
# tone marks, thanthakhat).
my $non_start_char = qr/\A [\x{0E2F}-\x{0E3A}\x{0E46}-\x{0E4C}]/x;

# ----------------------------------------------------------------------
# Ordered rule table.  Each entry is [ name, pattern, attach ].  The first
# pattern that matches at the current position wins, so the ORDER IS
# SIGNIFICANT and is kept exactly as in the original program.  When
# "attach" is true the matched text is appended to the previous cluster
# (if there is one) instead of starting a new cluster.
# ----------------------------------------------------------------------
my @rules = (
    ####### Default rules
    [ 'd1', qr/\A ฯลฯ/x ],                                      # ฯลฯ
    [ 'd2', qr/\A [A-Za-z]+/x ],                                # Latin letters
    [ 'd3', qr/\A [0-9]+/x ],                                   # ASCII digits
    # Attach ร์, ดิ์, ตร์, ทธิ์, ถุ์ to the existing previous unit.
    [ 'd4', qr/\A $consonant $consonant? $vowel? $thanthakhat/x, 1 ],

    ####### Level 0: very risky rules, apply to all types of cluster consonant.
    # l0.1 (disabled; ใคร, ใกล้ but mis-joins สา|เกล้|น|แก้ว, โคล้|อมวง):
    #      [เแไใโ] cluster_lead [รลว] tone?
    # NOTE(review): l0.2 has no required tail, so it appears to pre-empt
    # every later rule that begins with [เแโ] + ห + after_ho; order kept.
    [ 'l0.2',  qr/\A [เแไใโ] ห $after_ho $tone?/x ],            # ไหน, ไหล่
    # l0.3 (disabled; ครัว, ครั้ง but mis-joins สิ|บวัน):
    #      cluster_lead [รลว] mai-han-akat tone? final
    [ 'l0.4',  qr/\A ห $after_ho $mai_han_akat $tone? $final/x ],   # หนัง, หยั่ง
    # l0.5 (disabled; คลำ, ประ but mis-joins แบ|บระ|บบ, แบ|บระ|ดับ):
    #      cluster_lead [รลว] tone? [ะำ]
    [ 'l0.6',  qr/\A ห $after_ho $tone? $sara_a_or_am/x ],      # หนำ, หวะ
    [ 'l0.7',  qr/\A [เแ] $cluster_lead $ra_la_wa $maitaikhu $final/x ],  # เสร็จ
    [ 'l0.8',  qr/\A [เแ] ห $after_ho $maitaikhu $final/x ],
    # l0.9 (disabled): [เแโ] cluster_lead [รลว] tone? ะ
    [ 'l0.10', qr/\A [เแโ] ห $after_ho $tone? ะ/x ],
    [ 'l0.11', qr/\A เ $cluster_lead $ra_la_wa $tone? าะ/x ],
    [ 'l0.12', qr/\A เ ห $after_ho $tone? าะ/x ],
    [ 'l0.13', qr/\A เ $cluster_lead $ra_la_wa $tone? า/x ],
    [ 'l0.14', qr/\A เ ห $after_ho $tone? า/x ],
    [ 'l0.15', qr/\A เ $cluster_lead $ra_la_wa $tone? อะ/x ],
    [ 'l0.16', qr/\A เ ห $after_ho $tone? อะ/x ],
    # l0.17 (disabled): เ cluster_lead [รลว] sara-i tone? final
    [ 'l0.18', qr/\A เ ห $after_ho $sara_i $tone? $final/x ],
    [ 'l0.19', qr/\A เ $cluster_lead $ra_la_wa $sara_ii $tone? ยะ/x ],
    [ 'l0.20', qr/\A เ ห $after_ho $sara_ii $tone? ยะ/x ],
    [ 'l0.21', qr/\A เ $cluster_lead $ra_la_wa $sara_uee $tone? อะ/x ],
    [ 'l0.22', qr/\A เ ห $after_ho $sara_uee $tone? อะ/x ],

    ####### Level 1: cluster consonant, may take an additional final consonant.
    [ 'l1.1', qr/\A เ $cluster_lead $ra_la_wa $sara_ii? $tone? ย/x ],
    [ 'l1.2', qr/\A เ ห $after_ho $sara_ii? $tone? ย/x ],       # เหลี่ย|ม
    [ 'l1.3', qr/\A เ $cluster_lead $ra_la_wa $sara_uee? $tone? อ/x ],  # เคลือ
    [ 'l1.4', qr/\A เ ห $after_ho $sara_uee? $tone? อ/x ],      # เหลือ
    # l1.5 (disabled; mis-joins มา|กว่า): cluster_lead [รลว] tone? า
    [ 'l1.6', qr/\A ห $after_ho $tone? า/x ],                   # หนา, หน้า
    # l1.7 (disabled; mis-joins สรุ|ป): cluster_lead [รลว] upper/lower vowel tone?
    [ 'l1.8', qr/\A ห $after_ho $upper_lower_vowel $tone?/x ],  # หรื|อ

    ####### Level 2: never takes any more final consonant.
    [ '1',    qr/\A $consonant $mai_han_akat $tone? $final/x ],
    [ '1.1',  qr/\A $consonant $maitaikhu/x ],
    [ '1.2',  qr/\A $consonant $tone? $sara_a_or_am/x ],
    # NOTE(review): rule 1 already matches consonant + mai han akat + ว
    # (ว is a final consonant), so rule 2 looks unreachable; order kept.
    [ '2',    qr/\A $consonant $mai_han_akat $tone? วะ/x ],
    [ '3',    qr/\A [เแ] $consonant $maitaikhu $final/x ],
    [ '4',    qr/\A [เแโ] $consonant $tone? ะ/x ],
    [ '5',    qr/\A เ $consonant $tone? าะ/x ],
    [ '6',    qr/\A เ $consonant $tone? า/x ],
    [ '7',    qr/\A เ $consonant $tone? อะ/x ],
    [ '8',    qr/\A เ $consonant $sara_i $tone? $final/x ],
    [ '9',    qr/\A เ $consonant $sara_ii $tone? ยะ/x ],
    [ '10',   qr/\A เ $consonant $sara_uee $tone? อะ/x ],
    # 11 (disabled): consonant tone? ว final
    [ '12',   qr/\A $consonant ฤ $final/x ],
    [ '12.1', qr/\A ใ $consonant $tone?/x ],
    [ '16.1', qr/\A $consonant $sara_ue_uee $tone? $final/x ],

    ####### Level 3: may take an additional final consonant.
    [ '13', qr/\A เ $consonant $sara_ii? $tone? ย/x ],
    [ '14', qr/\A เ $consonant $sara_uee? $tone? อ/x ],
    [ '15', qr/\A $consonant $tone? า/x ],
    [ '16', qr/\A $consonant $sara_i_ii_u_uu $tone?/x ],
    [ '17', qr/\A [เแไโ] $consonant $tone?/x ],

    ####### Ad hoc rules
    [ 'a1', qr/\A อย $tone? า/x ],                              # อย่า
    [ 'a2', qr/\A อย $sara_u_uu $tone?/x ],                     # อยู่
    # (disabled; ตรร but mis-joins วงจรรวม): consonant รร
    [ 'a3', qr/\A ห $after_ho $tone $basic_final/x ],           # หล่น, หย่น
    [ 'a4', qr/\A ห $after_ho $tone ว $final/x ],               # หน่วย
    [ 'a5', qr/\A $consonant $tone $basic_final/x ],            # ช่ง
    [ 'a6', qr/\A $consonant $tone ว $final/x ],                # อ้วน
    # (disabled; ช่วย but mis-joins ต่อคณะ): consonant tone consonant consonant
);

# cluster_token($token)
#
# Cut one whitespace-free token (a string of characters, not bytes) into
# character clusters and return them as a list, in order.  At each position
# the rule table is tried top to bottom; if no rule matches, a single
# character is consumed.
sub cluster_token {
    my ($token) = @_;

    my @units;
    my $pos = 0;

    POSITION:
    while ($pos < length($token)) {
        my $rest = substr($token, $pos);

        for my $rule (@rules) {
            my ($pattern, $attach) = @{$rule}[1, 2];
            next if $rest !~ $pattern;

            # Every pattern is anchored at \A, so the end offset of the
            # match is also its length.
            my $len  = $+[0];
            my $text = substr($rest, 0, $len);

            if ($attach && @units) {
                $units[-1] .= $text;
            }
            else {
                push @units, $text;
            }

            $pos += $len;
            next POSITION;
        }

        # Rule d5: a character that can never start a word is glued onto
        # the previous cluster.  Without a previous cluster (or for any
        # other character) it becomes a cluster of its own.
        my $char = substr($rest, 0, 1);
        if (@units && $char =~ $non_start_char) {
            $units[-1] .= $char;
        }
        else {
            push @units, $char;
        }
        $pos += 1;
    }

    return @units;
}

# cluster_line($line)
#
# Cluster every whitespace-separated token of $line (already decoded, line
# terminator removed) and return the formatted output text: each cluster
# followed by "|", tokens joined by " |", and a final newline.  Returns the
# empty string when the line holds no token.
#
# split is called without a limit so that trailing whitespace (including a
# stray "\r" from CRLF input) does not create an empty last field, which
# used to print a spurious extra "|".
sub cluster_line {
    my ($line) = @_;

    my @tokens = split ' ', $line;
    return '' if !@tokens;

    my @rendered;
    for my $token (@tokens) {
        push @rendered, join '', map { "$_|" } cluster_token($token);
    }

    return join(' |', @rendered) . "\n";
}

# main()
#
# Command-line driver: filters the files named in @ARGV (or STDIN) to
# STDOUT.
sub main {
    # The a2p boilerplate accepted awk-style leading NAME=value arguments
    # and eval'ed them.  None of them has a meaning for this program, so
    # they are still consumed (not treated as file names) but no longer
    # evaluated as code.
    shift @ARGV while @ARGV && $ARGV[0] =~ /^[A-Za-z_]+=/;

    binmode STDOUT, ':encoding(UTF-8)';

    while (my $line = <>) {
        chomp $line;            # chomp, not chop: keep the last character
                                # of a final line that has no newline
        print cluster_line(decode('UTF-8', $line));
    }

    return;
}

# Run only when executed as a script, so the helpers above can be loaded
# (e.g. by a test) without reading any input.
main() unless caller;