1
1
Fork 0
mirror of https://github.com/QB64-Phoenix-Edition/QB64pe.git synced 2024-07-09 22:25:12 +00:00

Improve Entity/UTF-8 handling

- UTF-8 handling changed to signal missing/unknown chars
- old entity handling (see 1st commit) removed and replaced by a method similar to the UTF-8 handling, including signalling of missing/unknown entities
- added dynamic warnings at the top of pages that use missing/unknown entities or UTF-8 characters
This commit is contained in:
Roland Heyder 2023-02-02 00:10:31 +01:00
parent 01ed59c1d3
commit 9c7e067990
2 changed files with 77 additions and 8 deletions

View file

@ -52,12 +52,30 @@ DIM SHARED Help_Search_Str AS STRING
DIM SHARED Help_PageLoaded AS STRING
DIM SHARED Help_Recaching, Help_IgnoreCache
'HTML entity replacements
'(for non-markup chars only, i.e. not &amp; &lt; &gt; &quot;, which are handled in SUB Wiki$ directly)
'
'Each .enti value must be the literal entity text as it occurs in the page
'source, i.e. it starts with "&" and ends with ";". The lookup in SUB WikiParse
'is only entered when the current char is "&" and then compares the text at
'that position against .enti, so an entry that stores the already decoded
'character instead of the entity could never match.
TYPE wikiEntityReplace
    enti AS STRING * 8 '= entity as supported (ie. name where available, else as decimal number)
    repl AS STRING * 8 '= replacement string (1-8 chars)
END TYPE
DIM SHARED wpEntRepl(0 TO 10) AS wikiEntityReplace
'wpEntReplCnt is the index of the last used table entry. It starts at -1 and is
'incremented BEFORE each entry is written, hence loops over the table simply
'run "0 TO wpEntReplCnt" (no "wpEntReplCnt - 1" needed).
DIM SHARED wpEntReplCnt: wpEntReplCnt = -1
'Both spellings of the apostrophe are listed, as either may occur in a page.
wpEntReplCnt = wpEntReplCnt + 1: wpEntRepl(wpEntReplCnt).enti = "&apos;": wpEntRepl(wpEntReplCnt).repl = "'" 'apostrophe (named)
wpEntReplCnt = wpEntReplCnt + 1: wpEntRepl(wpEntReplCnt).enti = "&#39;": wpEntRepl(wpEntReplCnt).repl = "'" 'apostrophe (decimal)
wpEntReplCnt = wpEntReplCnt + 1: wpEntRepl(wpEntReplCnt).enti = "&#91;": wpEntRepl(wpEntReplCnt).repl = "[" 'open square bracket
wpEntReplCnt = wpEntReplCnt + 1: wpEntRepl(wpEntReplCnt).enti = "&#93;": wpEntRepl(wpEntReplCnt).repl = "]" 'close square bracket
wpEntReplCnt = wpEntReplCnt + 1: wpEntRepl(wpEntReplCnt).enti = "&#123;": wpEntRepl(wpEntReplCnt).repl = "{" 'open curly bracket
wpEntReplCnt = wpEntReplCnt + 1: wpEntRepl(wpEntReplCnt).enti = "&#125;": wpEntRepl(wpEntReplCnt).repl = "}" 'close curly bracket
'The replacements below are single-byte character codes, not ASCII literals.
wpEntReplCnt = wpEntReplCnt + 1: wpEntRepl(wpEntReplCnt).enti = "&pi;": wpEntRepl(wpEntReplCnt).repl = CHR$(227) 'pi
wpEntReplCnt = wpEntReplCnt + 1: wpEntRepl(wpEntReplCnt).enti = "&theta;": wpEntRepl(wpEntReplCnt).repl = CHR$(233) 'theta
wpEntReplCnt = wpEntReplCnt + 1: wpEntRepl(wpEntReplCnt).enti = "&nbsp;": wpEntRepl(wpEntReplCnt).repl = CHR$(255) 'non-breaking space
'Unicode replacements
TYPE wikiUtf8Replace
utf8 AS STRING * 4 '= MKI$(reversed hex 2-byte UTF-8 sequence) or MKL$(reversed hex 3/4-byte UTF-8 sequence)
repl AS STRING * 8 '= replacement string (1-8 chars)
END TYPE
DIM SHARED wpUtfRepl(0 TO 50) AS wikiUtf8Replace
DIM SHARED wpUtfRepl(0 TO 40) AS wikiUtf8Replace
DIM SHARED wpUtfReplCnt: wpUtfReplCnt = -1 'wpUtfRepl index counter (pre-increment, hence
'you don't need "wpUtfReplCnt - 1" when used in loops, just do "0 TO wpUtfReplCnt"
'Note: All UTF-8 values must be reversed in MKI$/MKL$, as it flips them to little endian.

View file

@ -250,7 +250,7 @@ SUB WikiParse (a$) 'Wiki page interpret
Help_Center = 0: Help_CIndent$ = ""
Help_DList = 0: Help_ChkBlank = 0
link = 0: elink = 0: cb = 0: nl = 1: hl = 0: ah = 0: dl = 0
link = 0: elink = 0: ue = 0: uu = 0: cb = 0: nl = 1: hl = 0: ah = 0: dl = 0
col = Help_Col
@ -865,33 +865,49 @@ SUB WikiParse (a$) 'Wiki page interpret
END IF
END IF
'HTML entity handling (no restrictions)
IF c$ = "&" THEN 'possible entity
FOR ii = 0 TO wpEntReplCnt
ent$ = RTRIM$(wpEntRepl(ii).enti)
IF c$(LEN(ent$)) = ent$ THEN
Help_AddTxt RTRIM$(wpEntRepl(ii).repl), col, 0
i = i + LEN(ent$) - 1: GOTO charDone
END IF
NEXT
ii = INSTR(c$(8), ";"): iii = INSTR(c$(8), " ") 'unknown entity?
IF ii > 0 AND (iii = 0 OR iii > ii) THEN
Help_AddTxt c$(ii), 8, 0: ue = -1
i = i + ii - 1: GOTO charDone
END IF
END IF
'Unicode handling (no restrictions)
IF ((c AND &HE0~%%) = 192) AND ((ASC(c$(2), 2) AND &HC0~%%) = 128) THEN '2-byte UTF-8
i = i + 1
FOR ii = 0 TO wpUtfReplCnt
IF wpUtfRepl(ii).utf8 = c$(2) + MKI$(&H2020) THEN
Help_AddTxt RTRIM$(wpUtfRepl(ii).repl), col, 0: EXIT FOR
Help_AddTxt RTRIM$(wpUtfRepl(ii).repl), col, 0: GOTO charDone
END IF
NEXT
GOTO charDone
Help_AddTxt CHR$(168), 8, 0: uu = -1: GOTO charDone
END IF
IF ((c AND &HF0~%%) = 224) AND ((ASC(c$(2), 2) AND &HC0~%%) = 128) AND ((ASC(c$(3), 3) AND &HC0~%%) = 128) THEN '3-byte UTF-8
i = i + 2
FOR ii = 0 TO wpUtfReplCnt
IF wpUtfRepl(ii).utf8 = c$(3) + CHR$(0) THEN
Help_AddTxt RTRIM$(wpUtfRepl(ii).repl), col, 0: EXIT FOR
Help_AddTxt RTRIM$(wpUtfRepl(ii).repl), col, 0: GOTO charDone
END IF
NEXT
GOTO charDone
Help_AddTxt CHR$(168), 8, 0: uu = -1: GOTO charDone
END IF
IF ((c AND &HF8~%%) = 240) AND ((ASC(c$(2), 2) AND &HC0~%%) = 128) AND ((ASC(c$(3), 3) AND &HC0~%%) = 128) AND ((ASC(c$(4), 4) AND &HC0~%%) = 128) THEN '4-byte UTF-8
i = i + 3
FOR ii = 0 TO wpUtfReplCnt
IF wpUtfRepl(ii).utf8 = c$(4) THEN
Help_AddTxt RTRIM$(wpUtfRepl(ii).repl), col, 0: EXIT FOR
Help_AddTxt RTRIM$(wpUtfRepl(ii).repl), col, 0: GOTO charDone
END IF
NEXT
GOTO charDone
Help_AddTxt CHR$(168), 8, 0: uu = -1: GOTO charDone
END IF
'Line break handling (no restrictions)
@ -940,6 +956,41 @@ SUB WikiParse (a$) 'Wiki page interpret
LOOP
'END_PARSE_LOOP
'Write and rearrange Entity & Unicode error messages (if any)
IF ue OR uu THEN
Help_LinkN = Help_LinkN + 1
Help_Link$ = Help_Link$ + "EXTL:https://qb64phoenix.com/forum/forumdisplay.php?fid=25" + Help_Link_Sep$
stp = CVL(RIGHT$(Help_Line$, 4))
Help_AddTxt STRING$(Help_ww, 196), 14, 0: Help_NewLine
itp = CVL(MID$(Help_Line$, 13, 4)): dtl = CVL(RIGHT$(Help_Line$, 4)) - stp
txt$ = MID$(Help_Txt$, stp, dtl) + MID$(Help_Txt$, itp, stp - itp): MID$(Help_Txt$, itp, LEN(txt$)) = txt$
Help_Line$ = LEFT$(Help_Line$, 12) + MKL$(itp) + MID$(Help_Line$, 13, LEN(Help_Line$) - 16)
FOR i = 17 TO LEN(Help_Line$) STEP 4: MID$(Help_Line$, i, 4) = MKL$(CVL(MID$(Help_Line$, i, 4)) + dtl): NEXT
IF uu THEN
stp = CVL(RIGHT$(Help_Line$, 4))
Help_AddTxt "!>", 4, 0
Help_AddTxt " Page uses ", Help_Col_Normal, 0
Help_AddTxt "unknown UTF-8 characters", 8, 0
Help_AddTxt ", please report it in the ", Help_Col_Normal, 0
Help_AddTxt "Wiki Forum.", Help_Col_Link, Help_LinkN: Help_NewLine
itp = CVL(MID$(Help_Line$, 13, 4)): dtl = CVL(RIGHT$(Help_Line$, 4)) - stp
txt$ = MID$(Help_Txt$, stp, dtl) + MID$(Help_Txt$, itp, stp - itp): MID$(Help_Txt$, itp, LEN(txt$)) = txt$
Help_Line$ = LEFT$(Help_Line$, 12) + MKL$(itp) + MID$(Help_Line$, 13, LEN(Help_Line$) - 16)
FOR i = 17 TO LEN(Help_Line$) STEP 4: MID$(Help_Line$, i, 4) = MKL$(CVL(MID$(Help_Line$, i, 4)) + dtl): NEXT
END IF
IF ue THEN
stp = CVL(RIGHT$(Help_Line$, 4))
Help_AddTxt "!>", 4, 0
Help_AddTxt " Page uses ", Help_Col_Normal, 0
Help_AddTxt "unknown HTML entities", 8, 0
Help_AddTxt ", please report it in the ", Help_Col_Normal, 0
Help_AddTxt "Wiki Forum.", Help_Col_Link, Help_LinkN: Help_NewLine
itp = CVL(MID$(Help_Line$, 13, 4)): dtl = CVL(RIGHT$(Help_Line$, 4)) - stp
txt$ = MID$(Help_Txt$, stp, dtl) + MID$(Help_Txt$, itp, stp - itp): MID$(Help_Txt$, itp, LEN(txt$)) = txt$
Help_Line$ = LEFT$(Help_Line$, 12) + MKL$(itp) + MID$(Help_Line$, 13, LEN(Help_Line$) - 16)
FOR i = 17 TO LEN(Help_Line$) STEP 4: MID$(Help_Line$, i, 4) = MKL$(CVL(MID$(Help_Line$, i, 4)) + dtl): NEXT
END IF
END IF
'Trim Help_Txt$
Help_Txt$ = LEFT$(Help_Txt$, Help_Txt_Len) + CHR$(13) 'chr13 stops reads past end of content