shithub: riscv

Download patch

ref: f337516664d52d6211ded7a74237c72057734e1c
parent: 8175958b19d8b34bd96f9199e7face8470fa5cf2
author: Jacob Moody <moody@posixcafe.org>
date: Wed Oct 12 14:17:49 EDT 2022

ktrans: revisit grammar handling and do some spring cleaning

Revisiting the man page example and README from the
original ktrans, there was some discrepancy on how
to handle punctuation marks and special characters.

Notably 。(.) and 、(,) need special casing. If we
have anything in the buffer we need to tack on
these characters as an Okuri tail and keep the actual
punctuation from entering the Kanji buffer.

Newlines were previously modified to be taken as a completion
if there were runes in the buffer. This has been backed out;
Shift + Space can serve this role via kbmap should the user
prefer (and as is done in the jp kbmap). Newlines are now treated
as hints to reset the buffer.

There was also a bug where, after cycling through all options, the
original hiragana was not printed back. This has been corrected.

The max number of candidates has been bumped to 32 and moved to an enum.
This does nearly double our resident memory size, but we regularly had
matches exceed this limit. A better solution is slated.

The man page now makes an attempt to explain the rules around Okuri and
Joshi input modes.

--- a/sys/man/1/ktrans
+++ b/sys/man/1/ktrans
@@ -91,15 +91,16 @@
 transliterated but discarded, providing a scratch input space. The 
 .B -G
 option disables this display.
-.SH JAPANESE
-The Hiragana and Katakana modes implicitly turn Hepburn representations
-in to their Kana counterparts. Explicit conversions combine sequences
-of Hiragana in to Kanji.
-.PP
+.SH "KEY MAPPING"
+For convenience, the control characters used by
+.I ktrans
+can be mapped directly to physical keys through modifications
+of the kbmap (see
+.IR kbdfs (8)).
 The
 .B /sys/lib/kbmap/jp
-keyboard map will turn the language input keys
-present on OADG 109(A) keyboards in to control
+mapping will turn language input keys
+present on Japanese A01/106/109(A) in to control
 sequences matching their label:
 .TP
 .B Henkan
@@ -113,9 +114,42 @@
 .TP
 .B Shift + Hiragana / Katakana
 Switch to Katakana (ctl-v)
+.TP
+.B Hankaku / Zenkaku
+Switch to Hiragana (ctl-n)
+.TP
+.B Shift + Hankaku / Zenkaku
+Switch to passthrough (ctl-t)
+.TP
+.B Shift + Space
+Convert to Kanji (ctl-\e).
+This is a fallback for keyboards without a physical Henkan key.
+.SH JAPANESE
+The Hiragana and Katakana modes implicitly turn Hepburn representations
+in to their Kana counterparts. Explicit conversions combine sequences
+of Hiragana in to Kanji.
+.PP
+Capital Latin input is used for hinting. For adjectives and verbs, a single
+capital is used as an Okurigana hint. For example,
+.ft Jp
+動かす
+.ft
+is typed as 'ugoKasu[^\e]'. The hint serves two purposes: it is
+provided as part of the explicit sequence for Kanji lookup and denotes
+that the following runes are Okurigana. 
+.PP
+For particles, the entire Kana may be input in upper case. This similarly
+denotes the end of the Kanji portion of the sequence, but is not used
+as part of the lookup sequence itself. So to write
+.ft Jp
+私の猫
+.ft
+the user types "watashiNO[^\e]neko[^\e]". Note that in both cases
+we have successfully communicated to ktrans when to reset the explicit
+match buffer without needing to explicitly give a ctl-l character.
 .SH CHINESE
 The Wubizixing input method is used. No implicit conversion is done,
-explicit conversion interprets latin characters as their Wubi counterparts
+explicit conversion interprets Latin characters as their Wubi counterparts
 to do lookup of Hanzi.
 .SH RUSSIAN
 Implicit layer converts latin to Cyrillic; the transliteration is mostly
@@ -169,7 +203,9 @@
 .IR /sys/src/cmd/ktrans/READMEJ.kenji
 .SH BUGS
 .PP
-There is no way to generate the control characters literally.
+There is no hint from rio when the user moves the cursor, as such
+moving it is unlikely to result in what the user expects.
+.PP
 Plan9 lacks support for rendering combinational Unicode sequences,
 limiting the use of some code ranges.
 .SH HISTORY
--- a/sys/src/cmd/ktrans/main.c
+++ b/sys/src/cmd/ktrans/main.c
@@ -141,6 +141,10 @@
 	return h;
 }
 
+enum{
+	Maxkouho=32,
+};
+
 Hmap*
 opendict(Hmap *h, char *name)
 {
@@ -147,7 +151,7 @@
 	Biobuf *b;
 	char *p;
 	char *dot, *rest;
-	char *kouho[16];
+	char *kouho[Maxkouho];
 	int i;
 
 	b = Bopen(name, OREAD);
@@ -295,7 +299,7 @@
 	Mouse m;
 	Keyboardctl *kctl;
 	Rune key;
-	char *kouho[16+1+1], **s;
+	char *kouho[Maxkouho+1], **s;
 	Image *back, *text, *board, *high;
 	Font *f;
 	Point p;
@@ -412,7 +416,7 @@
 	int n;
 	char *p;
 	Hmap *dict;
-	char *kouho[16];
+	char *kouho[Maxkouho];
 	Str line;
 	Str last;
 	Str okuri;
@@ -427,8 +431,8 @@
 
 	dict = jisho;
 	selected = -1;
-	kouho[0] = nil;
 	mode = Kanji;
+	memset(kouho, 0, sizeof kouho);
 	resetstr(&last, &line, &okuri, nil);
 
 	threadsetname("dict");
@@ -435,16 +439,6 @@
 	while(recv(dictch, m) != -1){
 		for(p = m+1; *p; p += n){
 			n = chartorune(&r, p);
-			if(r != ''){
-				selected = -1;
-				kouho[0] = nil;
-				if(selected >= 0){
-					resetstr(&okuri, nil);
-					mode = Kanji;
-					send(selectch, &selected);
-				}
-				resetstr(&last, nil);
-			}
 			switch(r){
 			case LangJP:
 				dict = jisho;
@@ -459,12 +453,9 @@
 				}
 				emitutf(output, backspace, utflen(line.b));
 				/* fallthrough */
-			case ' ': case ',': case '.':
-			case '':
+			case '': case ' ': case '\n':
 				mode = Kanji;
-				resetstr(&line, &okuri, nil);
-				memset(kouho, 0, sizeof kouho);
-				send(displaych, kouho);
+				resetstr(&line, &okuri, &last, nil);
 				break;
 			case '\b':
 				if(mode != Kanji){
@@ -477,29 +468,20 @@
 				}
 				popstr(&line);
 				break;
-			case '\n':
-				if(line.b == line.p){
-					emitutf(output, "\n", 1);
-					break;
-				}
-				/* fallthrough */
 			case '':
 				selected++;
 				if(selected == 0){
-					if(hmapget(dict, line.b, kouho) < 0){
-						resetstr(&line, &last, nil);
-						selected = -1;
+					if(hmapget(dict, line.b, kouho) < 0)
 						break;
-					}
 					if(dict == jisho && line.p > line.b && isascii(line.p[-1]))
 						line.p[-1] = '\0';
 				}
 				if(kouho[selected] == nil){
 					/* cycled through all matches; bail */
+					emitutf(output, backspace, utflen(okuri.b));
 					emitutf(output, backspace, utflen(last.b));
 					emitutf(output, line.b, 0);
-					resetstr(&line, &last, &okuri, nil);
-					selected = -1;
+					emitutf(output, okuri.b, 0);
 					break;
 				}
 				send(selectch, &selected);
@@ -515,9 +497,16 @@
 				emitutf(output, kouho[selected], 0);
 				last.p = pushutf(last.b, strend(&last), kouho[selected], 0);
 				emitutf(output, okuri.b, 0);
-
-				resetstr(&line, nil);
 				mode = Kanji;
+				continue;
+			case ',': case '.':
+			case L'。': case L'、':
+				if(dict == zidian || line.p == line.b){
+					selected = 0; //hit cleanup below
+					break;
+				}
+				mode = Joshi;
+				okuri.p = pushutf(okuri.p, strend(&okuri), p, 1);
 				break;
 			default:
 				if(dict == zidian)
@@ -529,8 +518,7 @@
 					if(mode == Okuri){
 						popstr(&line);
 						mode = Joshi;
-						okuri.p = pushutf(okuri.p, strend(&okuri), p, 1);
-						break;
+						goto Okuri;
 					}
 					mode = Okuri;
 					*p = tolower(*p);
@@ -537,40 +525,51 @@
 					okuri.p = pushutf(okuri.b, strend(&okuri), p, 1);
 					goto Line;	
 				}
-				if(mode != Kanji){
-			Okuri:
+
+				switch(mode){
+				case Kanji:
+				Line:
+					line.p = pushutf(line.p, strend(&line), p, 1);
+					break;
+				default:
+				Okuri:
 					okuri.p = pushutf(okuri.p, strend(&okuri), p, 1);
 					break;
 				}
-			Line:
-				line.p = pushutf(line.p, strend(&line), p, 1);
-				memset(kouho, 0, sizeof kouho);
-				if(hmapget(dict, line.b, kouho) == 0){
-					selected = -1;
-					send(selectch, &selected);
-				}
-				send(displaych, kouho);
-				break;
 			}
+
+			if(selected >= 0){
+				resetstr(&okuri, &last, &line, nil);
+				selected = -1;
+				send(selectch, &selected);
+			}
+			memset(kouho, 0, sizeof kouho);
+			hmapget(dict, line.b, kouho);
+			send(displaych, kouho);
 		}
 	}
 }
 
-static int
-telexlkup(Str *line, Str *out)
+static void
+telexlkup(Str *line)
 {
 	Map lkup;
 	char buf[UTFmax*3], *p, *e;
-	int n;
+	Str out;
+	int n, ln;
 
+Again:
+	ln = utflen(line->b);
 	p = pushutf(buf, buf+sizeof buf, line->b, 1);
 	n = p-buf;
 
-	if(hmapget(telex, buf, &lkup) < 0)
-		return -1;
+	if(hmapget(telex, buf, &lkup) < 0){
+		resetstr(line, nil);
+		return;
+	}
 
 	if(utflen(line->b) < 2)
-		return 2;
+		return;
 
 	e = peekstr(line->p, line->b);
 	pushutf(p, buf+sizeof buf, e, 1);
@@ -578,16 +577,20 @@
 		/* not correct; matches should be allowed to span vowels */
 		if(hmapget(telex, buf+n, &lkup) == 0)
 			line->p = pushutf(line->b, strend(line), buf+n, 0);
-		return 2;
+		return;
 	}
 
-	out->p = pushutf(out->b, strend(out), lkup.kana, 0);
-	out->p = pushutf(out->p, strend(out), line->b+n, 0);
-	popstr(out);
+	out.p = pushutf(out.b, strend(&out), lkup.kana, 0);
+	out.p = pushutf(out.p, strend(&out), line->b+n, 0);
+	popstr(&out);
 
+	if(ln > 0)
+		emitutf(output, backspace, ln);
+	emitutf(output, out.b, 0);
+	line->p = pushutf(line->b, strend(line), out.b, 0);
 	if(utflen(lkup.kana) == 2)
-		return 1;
-	return 0;
+		return;
+	goto Again;
 }
 
 static void
@@ -597,10 +600,10 @@
 	char m[Msgsize];
 	Map lkup;
 	char *p;
-	int n, ln, rn;
+	int n;
 	Rune r;
 	char peek[UTFmax+1];
-	Str line, tbuf;
+	Str line;
 
 	peek[0] = lang = deflang;
 	resetstr(&line, nil);
@@ -628,25 +631,31 @@
 				resetstr(&line, nil);
 				continue;
 			}
-			if(lang == LangVN && utfrune(" ", r) != nil){
+			if(lang == LangEN){
+				emitutf(output, p, 1);
+				continue;
+			}
+			if(utfrune("", r) != nil){
 				resetstr(&line, nil);
-				if(r != ' ')
-					continue;
+				emitutf(dictch, p, 1);
+				continue;
 			}
-			if(lang == LangZH || lang == LangJP){
+			emitutf(output, p, 1);
+
+			switch(lang){
+			case LangZH:
 				emitutf(dictch, p, 1);
-				if(utfrune("\n", r) != nil){
-					resetstr(&line, nil);
-					continue;
-				}
-				if(lang == LangJP && isupper(*p))
+				continue;
+			case LangJP:
+				emitutf(dictch, p, 1);
+				if(isupper(*p))
 					*p = tolower(*p);
+				break;
 			}
-
-			emitutf(output, p, 1);
-			if(lang == LangEN || lang == LangZH)
+			if(utfrune("\n\t ", r) != nil){
+				resetstr(&line, nil);
 				continue;
-			if(r == '\b'){
+			} else if(r == '\b'){
 				popstr(&line);
 				continue;
 			}
@@ -653,24 +662,8 @@
 
 			line.p = pushutf(line.p, strend(&line), p, 1);
 			if(lang == LangVN){
-			Again:
-				ln = utflen(line.b);
-				switch(rn = telexlkup(&line, &tbuf)){
-				default:
-					resetstr(&line, nil);
-					continue;
-				case 2:
-					continue;
-				case 1:
-				case 0:
-					if(ln > 0)
-						emitutf(output, backspace, ln);
-					emitutf(output, tbuf.b, 0);
-					line.p = pushutf(line.b, strend(&line), tbuf.b, 0);
-					if(rn == 0)
-						goto Again;
-					continue;
-				}
+				telexlkup(&line);
+				continue;
 			}
 			if(maplkup(lang, line.b, &lkup) < 0){
 				resetstr(&line, nil);
@@ -836,7 +829,7 @@
 		selectch = nil;
 	} else {
 		selectch = chancreate(sizeof(int), 1);
-		displaych = chancreate(sizeof(char*)*16, 1);
+		displaych = chancreate(sizeof(char*)*Maxkouho, 1);
 		proccreate(displaythread, nil, mainstacksize);
 	}