(*
 * LANGUAGE    : ANS Forth
 * PROJECT     : Forth Environments
 * DESCRIPTION : generate and use a directed acyclic word graph (DAWG)
 * CATEGORY    : Tool
 * AUTHOR      : Ian Osgood
 * LAST CHANGE : Sunday, May 11, 2003 10:00 AM, Marcel Hendrix
 *)


	NEEDS -miscutil

	REVISION -dawg "ÄÄÄ DAWG                Version 1.01 ÄÄÄ"

	PRIVATES

DOC
(*
	From: iano@quirkster.com (Ian Osgood)
	Newsgroups: comp.lang.forth
	Subject: DAWG.F 0/2
	Date: 10 May 2003 18:51:42 -0700

	The following is code to generate and use a directed acyclic word
	graph (DAWG).

	A DAWG is a data structure for compressing a list of words into a form
	amenable to quick searching.  Processing a 90000 word Scrabble
	dictionary obtains a 240KB DAWG file in a handful of seconds.  The
	DAWG is suitable for use in a spell-checker, anagram finder, or word
	game.  Its use for instantly enumerating every legal play in a game of
	Scrabble is described in "The World's Fastest Scrabble Program",
	Journal of the ACM.

	The DAWG is a cell array of 32-bit "nodes".  The first cell contains
	the index of the root "block", which happens to be the final block in
	the array.  A node contains a 5-bit letter, an end-of-word flag, an
	end-of-block flag, and the 24-bit array index of a block.  A node
	represents an edge of the word graph labeled with a letter.  It also
	represents a subgraph, a set of suffixes beginning with that letter. 
	A block is a contiguous sequence of nodes representing all the
	possible next letters, or suffixes, of a prefix.

	Part 1/2 contains some base words and the DAWG generator.  It uses a
	hash table.
	Part 2/2 contains sample code for using the DAWG, including a Boggle
	game solver.  It uses a sorted string array with a binary search.

	Comments and suggestions are welcome.

	Ian Osgood

        ------------------------ BUGS ----------------------------------
	These engwords.txt words have more than 16 characters, so the dawg 
	fails to build, unless max-word-len is 24 or so.
	
	arteriolosclerosis
	contradistinction
	contradistinguish
	diethylstilbestrol
	...
	ultraconservative

*)
ENDDOC

\  make-dawg - converts the file "ospd.txt" of lowercase words,
\              sorted alphabetically one per line, into "dawg.out"

\ TRIE/DAWG node structure definition

1 #30 LSHIFT    =: EOB_MASK   PRIVATE
1 #29 LSHIFT    =: EOW_MASK   PRIVATE
1 #24 LSHIFT 1- =: INDEX_MASK PRIVATE

:INLINE Let ( node -- 1-26)      #24 RSHIFT #31 AND ; PRIVATE
:INLINE EOW ( node -- nz )       EOW_MASK   AND ; PRIVATE 
:INLINE EOB ( node -- nz )       EOB_MASK   AND ; PRIVATE
:INLINE Ind ( node -- index )    INDEX_MASK AND ; PRIVATE

:INLINE InitLet ( 1-26 -- node ) #24 LSHIFT ; PRIVATE

:INLINE let>c  ( 1-26 -- a-z )   [CHAR] a + 1- ; PRIVATE
:INLINE c>let  ( a-z -- 1-26 )   [CHAR] a - 1+ ; PRIVATE
:INLINE ?c>let ( c -- 0-26 )     DUP [CHAR] a [CHAR] z 1+ WITHIN IF  c>let  ELSE  DROP 0  ENDIF ; PRIVATE

\ utility
:INLINE CELL/   2 RSHIFT ; PRIVATE

\ common

#24 CHARS =: max-word-len  
VARIABLE word-len   PRIVATE  CREATE next-word PRIVATE max-word-len ALLOT
VARIABLE prefix-len PRIVATE  CREATE prefix    PRIVATE max-word-len ALLOT

:INLINE prefix-len+ ( addr1 -- addr2 ) prefix-len @ CHARS + ;  PRIVATE

:INLINE next-word-has-prefix? ( -- nz )
  next-word prefix-len @ prefix OVER COMPARE 0= word-len @ AND ; PRIVATE
  

\ DAWG builder
VARIABLE words-file	PRIVATE
VARIABLE dawg-file	PRIVATE
VARIABLE cur-dawg-index PRIVATE

\ [a-z] only
: legal-pad? ( -- tf )
  	       word-len @ 0= IF  TRUE  EXIT  THEN	\ EOF
  max-word-len word-len @ <  IF  FALSE EXIT  THEN
  PAD word-len @ 
  BOUNDS DO I C@ ?c>let 0= IF  FALSE UNLOOP EXIT  THEN
       LOOP TRUE ;

: get-next-word
  BEGIN PAD #80 words-file @ READ-LINE 2DROP word-len !
        legal-pad?
  UNTIL PAD next-word word-len @ CMOVE ;

: write-to-dawg ( block size -- )
  dawg-file @ WRITE-FILE ABORT" Can't write to dawg" ;P

\ Hash Table for blocks
#2311 =: hash-size PRIVATE
VARIABLE htab	   PRIVATE

: create-hash-table
  hash-size CELLS ALLOCATE ABORT" Hash table too big"
  DUP hash-size CELLS ERASE htab ! ;P

: htab@i ( hash-index -- head-entry-addr ) CELLS htab @ + ;
: ->next  ;P IMMEDIATE
:INLINE ->index CELL+ ; PRIVATE
:INLINE ->block CELL+ CELL+ ;  PRIVATE

: destroy-hash-table
  htab @ 
  hash-size 0
    DO
       DUP @
       BEGIN  ?DUP 
       WHILE  DUP ->next @  SWAP FREE DROP
       REPEAT CELL+
  LOOP 
  DROP htab @ FREE DROP ;P

\ 0 for a trie, >5 for a dawg (measured no dups above size 5)
6 CELLS =: Block-size-hash-threshold PRIVATE

: hash-block ( block -- hash )
  0 >R CELL-
  BEGIN  CELL+ DUP @ DUP R> XOR >R  EOB UNTIL
  DROP 
  R> U>D hash-size UM/MOD DROP ;P

: blocks-equivalent? ( block1 block2 -- TF )
  BEGIN  
     OVER @ OVER @ <> IF 2DROP FALSE EXIT THEN
     DUP @ EOB 0=
  WHILE  
     CELL+ SWAP CELL+
  REPEAT 
  2DROP TRUE ;P

: find-hash-block ( block -- index | 0 )
  DUP hash-block htab@i    ( block hash-block-addr )
  BEGIN @ DUP
  WHILE 2DUP ->block blocks-equivalent?
        IF ->index @ NIP EXIT
        THEN ->next
  REPEAT NIP ( 0 ) ;P
  
: add-hash-block ( size block -- )
  OVER ->block ALLOCATE ABORT" Can't allocate hash entry" 
  OVER hash-block htab@i           ( size block h head-addr )
  2DUP @ OVER ->next ! SWAP !      \ replace the head <- h
  cur-dawg-index @ OVER ->index !
  ->block ROT MOVE ;P

\ Core DAWG building algorithm

: index-for-block ( size block -- index )
  OVER Block-size-hash-threshold <
    IF DUP find-hash-block ?DUP IF  NIP NIP EXIT  THEN
       2DUP add-hash-block
  THEN
  OVER write-to-dawg  ( size )
  CELL/ cur-dawg-index @ TUCK + cur-dawg-index ! ;P

: append-next-letter-to-prefix ( -- a-z )
  next-word prefix-len+ C@
  prefix prefix-len+ 2DUP C@ <=
  IF  2DROP ABORT" Words out of order"  THEN
  2DUP C!  0 SWAP CHAR+ C!
  1 prefix-len +! ;P

: init-node-with-letter ( node a-z -- node )
  c>let InitLet OVER !
  word-len @ prefix-len @ =
    IF EOW_MASK OVER +!
       get-next-word
  THEN ;P

: remove-letter-from-prefix   -1 prefix-len +! ;P

: finish-block ( prefix-node last-node -- prefix-node )
  2DUP = IF  DROP EXIT  THEN
  EOB_MASK OVER +!
  OVER - OVER CELL+ ( prefix size block )
  index-for-block OVER +! ;P

: suffixes ( prefix-node-addr -- prefix-node-addr )
  DUP 	   ( prefix current )
  BEGIN  next-word-has-prefix?
  WHILE  CELL+		\ allocate a new node
         append-next-letter-to-prefix
         init-node-with-letter
         RECURSE	\ process all suffixes from this prefix
         remove-letter-from-prefix
  REPEAT   ( prefix last )
  finish-block ;P

\ Using ftp://puzzlers.org/pub/wordlists/engwords.txt
\    or ftp://puzzlers.org/pub/wordlists/ospd.txt
: make-dawg ( -- )
  S" ospd.txt" R/O OPEN-FILE   ABORT" No input file"  words-file !
  S" dawg.out" R/W CREATE-FILE ABORT" No output file"  dawg-file !

  create-hash-table
  #100 CELLS ALLOCATE ABORT" Can't allocate block stack"
  ( blocks ) 			\ max 87 for "outstunting"

  0 OVER ! DUP 4 write-to-dawg	\ skip root pointer
  1 cur-dawg-index !
  get-next-word
  0 prefix-len !  0 prefix C!
  suffixes  ( blocks[0] filled with root node index )
  0. dawg-file @ REPOSITION-FILE ABORT" Can't rewind"
  DUP 4 write-to-dawg		\ backpatch root pointer

  FREE DROP  destroy-hash-table
  dawg-file  @ CLOSE-FILE DROP
  words-file @ CLOSE-FILE DROP ;


\ DAWG usage utilities  (start session with "load-dawg")

\ Top level commands
\  load-dawg    - load a trie/dawg into memory
\  unload-dawg
\  tdtrav       - interactively traverse a TRIE/DAWG
\  word?        - lookup a word in the dawg

VARIABLE dawg PRIVATE

: read-trie ( fname count -- trie^ code )
  R/O OPEN-FILE            ?DUP IF EXIT THEN ( file )
  DUP FILE-SIZE            ?DUP IF EXIT THEN ( file udsize)
  D>S DUP ALLOCATE         ?DUP IF EXIT THEN ( file size mem^ )
  DUP 2OVER SWAP READ-FILE ?DUP IF EXIT THEN ( file size mem read )
  ROT <>                   ?DUP IF EXIT THEN ( file mem )
  SWAP CLOSE-FILE ;P

: load-dawg    S" ../dawg/dawg.out" read-trie ABORT" Can't load dawg" dawg ! ;
: unload-dawg  dawg @ FREE DROP ;

:INLINE dawg-root ( -- root-block )  dawg @ DUP @ CELLS + ; PRIVATE
:INLINE dawg@i    ( index -- block ) CELLS dawg @ + ; PRIVATE

: .prefix  ." '" prefix prefix-len @ TYPE ." '" ;P

: letter-in-block ( letter block-addr -- node-addr | 0 )
  SWAP LOCAL letter
  CELL-
  BEGIN 
     CELL+ DUP @ Let letter - DUP 0= IF  DROP   EXIT  ENDIF
     				 0>= IF  DROP 0 EXIT  ENDIF
     DUP @ EOB 
  UNTIL
  DROP 0 ;P

\
\ TRIE/DAWG checker
\

: .block ( block -- )
	CELL- 
  BEGIN CELL+ DUP @
        DUP EOW IF [CHAR] A  ELSE  [CHAR] a  THEN
        OVER Let 1- + EMIT
        EOB
  UNTIL DROP ;P

: trav ( index -- command[0^-.] )
	DUP 0= IF  .prefix ."  End of line." CR EXIT  THEN
  	CELLS dawg @ + 0  ( block^ command )
  BEGIN DROP .prefix ."  [" DUP .block ." ^-.] "
        KEY CR
        DUP [CHAR] a [CHAR] z 1+ WITHIN 
          IF DUP prefix prefix-len+ C!
             c>let OVER letter-in-block DUP
               IF 1 prefix-len +!
                  @ Ind RECURSE
                  -1 prefix-len +!
                  DUP [CHAR] - = OVER 8 = OR OVER 127 = OR 
                  IF  DROP 0  THEN
             THEN
        THEN
        DUP [CHAR] ^ =
        OVER [CHAR] - = OR OVER 8 = OR OVER 127 = OR
        OVER [CHAR] . = OR
  UNTIL NIP ;P

: tdtrav ( -- )
  0 prefix-len !  BEGIN dawg @ @ trav [CHAR] . = UNTIL ;

\
\ spell check
\

: is-word? ( addr len -- TF )
  BOUNDS dawg @  ( end cur node-addr )
  BEGIN
        @ Ind  DUP 0= IF  3DROP FALSE EXIT  ENDIF 	\ word too long
        dawg@i OVER C@ c>Let SWAP letter-in-block
               DUP 0= IF  3DROP FALSE EXIT  ENDIF 	\ word not found
        >R CHAR+ 2DUP = R> SWAP
  UNTIL
  2NIPS @ EOW 0<> ;					\ word maybe too short

: word?  BL PARSE is-word? IF ." Yes" ELSE ." No" THEN ;

-- A string with spaces
: $QUALIFY? ( c-addr u -- bool )
	2DUP BL (lex) 0= IF  IS-WORD? EXIT  ENDIF
	( c-addr u c-addr2 u2 c-addr1 u1 del ) DROP 
	IS-WORD? 0= IF  4DROP FALSE EXIT  ENDIF
	2SWAP 2DROP RECURSE ;

[DEFINED] testing 
  [IF]

\ FORTH> mini-bench
\ 50000 searches in 0.552 seconds elapsed.
\ 40000 positive results. ok

\ FORTH> mini-bench
\ 50000 searches in 0.264 seconds elapsed.
\ 40000 positive results. ok
: MINI-BENCH ( -- )
	0 LOCAL res
	#10000 LOCAL #times
	LOAD-DAWG
	CR 5 #times * DEC. ." searches in " TIMER-RESET
	#times 0 DO  S" arteriolosclerosis" is-word? 1 AND +TO res 
		     S" zygote" 	    is-word? 1 AND +TO res 
		     S" 123" 		    is-word? 1 AND +TO res 
		     S" conceit" 	    is-word? 1 AND +TO res 
		     S" forth" 		    is-word? 1 AND +TO res 
	       LOOP
	.ELAPSED CR res DEC. ." positive results." ;

\ Boggle sample program
\  random-board - fill the board with random letters
\  fill-board   - set the board to a particular state
\  .board       - show the board
\  solve-board  - use the DAWG to find all the words
\                 of length min-len or greater

4 VALUE min-len

6 5 * 1+ CHARS =: board-size  PRIVATE
CREATE board PRIVATE board-size ALLOT
\ 0 ,  0 ,  0 ,  0 ,  0 ,
\ 0 ,  1 ,  2 ,  3 ,  4 ,
\ 0 ,  5 ,  6 ,  7 ,  8 ,
\ 0 ,  9 , 10 , 11 , 12 ,
\ 0 , 13 , 14 , 15 , 16 ,
\ 0 ,  0 ,  0 ,  0 ,  0 , 0 ,

\ UI

: .line   CHARS board + 4 TYPE CR ;P ( index -- )
: .board  CR 6 .line #11 .line #16 .line #21 .line ; ( -- )

: fill-line ( index "abcd" -- ) 
  	BL PARSE 4 MIN ROT CHARS board + SWAP CMOVE ;P

: fill-board  ( -- )
	board board-size ERASE
  	6 fill-line #11 fill-line #16 fill-line #21 fill-line .board ;

: rand-letter ( -- a-z ) 
	#26 CHOOSE 1+ let>c ;P

: rlc!+ ( sq -- sq+1 ) 
	rand-letter OVER C! CHAR+ ;P

: rand-line ( index -- )
  CHARS board + rlc!+ rlc!+ rlc!+ rlc!+ DROP ;P

: random-board ( -- )
	board board-size ERASE
	6 rand-line #11 rand-line #16 rand-line #21 rand-line .board ;

\ results (sorted list, unique words)

0 VALUE found-words	PRIVATE
0 VALUE size-words	PRIVATE	\ allocated size
0 VALUE num-words	PRIVATE

: grow-words ( -- )
	size-words 0= IF  #16 DUP CELLS ALLOCATE ?ALLOCATE ( -- 16 addr )
		    ELSE  size-words 2*  found-words OVER CELLS RESIZE ?ALLOCATE
  		    THEN  
	TO found-words 
	TO size-words ;P

: allocate-string ( addr len -- c-str )
  	DUP 1+ ALLOCATE ?ALLOCATE PACK ;P
	\ DUP >R ( addr len caddr ) 2DUP C! CHAR+ SWAP CMOVE R> ;P

: insert-word ( n addr len -- ) 
	num-words size-words = IF  grow-words  THEN
	allocate-string
	( n c-str -- ) SWAP DUP >R CELLS found-words +
	( c-str fw+ncells ) DUP DUP CELL+ num-words R> - CELLS MOVE
	( c-str fw+ncells ) !  
	num-words 1+ TO num-words ;P

: add-word ( addr len -- ) ( / binary search )
  	2>R num-words 0 
  	BEGIN 2DUP - 
	WHILE 2DUP + 2/  DUP 2R@ ROT CELLS found-words + @ COUNT COMPARE
    	      DUP 0= IF  4DROP 2R> 2DROP EXIT  
	      	   THEN
    	          0< IF  ROT DROP SWAP 
	           ELSE  1+ NIP 
	           THEN
  	REPEAT 
	DROP 2R> insert-word ;P

: add-prefix  ( -- )
	prefix prefix-len @ add-word ;P

: clear-words ( -- )
  	num-words 0 ?DO	I CELLS found-words + @ FREE DROP
  	     	   LOOP  
	0 TO num-words ;P
  
: .words ( -- )
	num-words 0 ?DO I CELLS found-words + @ COUNT TYPE SPACE
  		   LOOP 
	CR ;

\ smarts

: solve-square ( block sq -- block sq )
  DUP C@ 0= ?EXIT 		\ edge or already used
  \ can traverse to letter on this square?
  2DUP C@ c>let SWAP letter-in-block ?DUP 0= ?EXIT
  \ OK: add letter to prefix  ( sq block-node )
  OVER C@ prefix prefix-len+ C!  1 prefix-len +!
  \ found a word?
  DUP @ EOW IF min-len prefix-len @ <= IF add-prefix THEN THEN
  \ no more suffixes?
  @ Ind ?DUP 0= IF -1 prefix-len +! EXIT THEN
  \ continue to surrounding squares
  dawg@i OVER ( next-block next-sq )
  0 OVER C!			\ mark used
  6 CHARS - RECURSE  CHAR+ RECURSE  CHAR+ RECURSE
  3 CHARS + RECURSE             2 CHARS + RECURSE
  3 CHARS + RECURSE  CHAR+ RECURSE  CHAR+ RECURSE
  2DROP -1 prefix-len +!	\ mark usable again
  prefix prefix-len+ C@ OVER C! ;P

: solve-line ( root sq -- root sq+5 )
  solve-square CHAR+ solve-square CHAR+
  solve-square CHAR+ solve-square CHAR+ CHAR+ ;P

: solve-board
  0 prefix-len !
  clear-words  dawg-root  6 CHARS board +
  solve-line solve-line solve-line solve-line  2DROP
  CR .words ;

[THEN]

:ABOUT	CR .~ make-dawg           -- converts "ospd.txt" of lowercase words,~
	CR .~                        sorted alphabetically one per line, into "dawg.out"~
	CR ." load-dawg           -- load the dawg file made by make-dawg (dawg.out)." 
[ [DEFINED] testing ]
  [IF]  CR ." Application: BOGGLE ( after load-dawg )"
	CR ." Usage: random-board "
	CR ."  or    fill-board abcd efgh ijkl mnop"
	CR ." Then:  solve-board  -- solve the boggle board" 
[THEN]	;

                .ABOUT -dawg CR
		DEPRIVE
					(* End of Source *)