% Copyright 1989 by Norman Ramsey, Odyssey Research Associates % Not to be sold, but may be used freely for any purpose % For more information, see file COPYRIGHT in the parent directory \message{OK, entering \string\batchmode...} \batchmode \let\RA\rightarrow \def\vert{{\tt\char'174}} \def\pb{$\.|\ldots\.|$} % C brackets (|...|) \def\title{SPIDER} \def\topofcontents{\null\vfill \titlefalse % include headline on the contents page \def\rheader{\hfil} \centerline{\titlefont The {\ttitlefont SPIDER} processor} \vfill} \def\syntax##1{\leavevmode\hbox{$\langle\hbox{\sl ##1\/}\rangle$}} \def\produces{\leavevmode\hbox{${}::={}$}} \def\opt##1{$[$##1$]$} #*={\tt SPIDER} proper. #*Introduction. This is an AWK program designed to read a description of a programming language and to write out the language-dependent parts of WEB. In the main, the description of a programming language is a list of all the tokens of the language (together with various facts about them) and a grammar for prettyprinting code fragments written in that language. The ``Spider User's Guide'' describes how to use {\tt SPIDER} to construct a {\tt WEB} system for the ALGOL-like language of your choice. ({\tt SPIDER} may be able to handle LISP and Miranda and other strange languages; the experiment hasn't been tried. The unusual lexical requirements of FORTRAN are probably beyond it, at least until the lexical analysis is modernized.) # The outline of the program is fairly straightforward. We use |exitcode| throughout to monitor error status. If we were more Knuthlike, we would have a |history| variable with values of |spotless|, and so on. This will have to wait until we get macros back into \.{TANGLE}. We put the pattern-action statement for productions last, because in case of a conflict like \.{token~-->~...}, we want the interpretation as {\tt token} to win out over the interpretation as a production. #u#1 BEGIN { # exitcode=0 } #@ # # # # #@ END { # print "Writing out lists" > logfile # # # if (exitcode != 0) { exit exitcode } } # There are a couple of actions we may want to perform with just about any command. If a command fails, we move on to the next, but we remember the fault so we can complain at the end. #= exitcode=-1 next # Throughout \.{SPIDER} we always use the variable |i| to step through the fields of a command, so that |$i| is always the next field of interest. When we think we have finished a command, we will always want to check to make sure there are no unexamined fields left over. For error messages, the line number is really |NR-1|, since we put an extra line at the beginning with the date. #= if (i<=NF) { print "Error: leftover fields", $i, "... on line", NR-1 # } # To \.{SPIDER}, any line beginning with |"## "| is a comment. \.{SPIDER} also ignores blank lines. #= #=/^##|^ *$/#> { ## comments, blank lines print $0 > logfile next } # But, if \.{SPIDER} encounters a line we don't recognize, it complains. #= { print "Warning: I don't know what to do with this line:" print " ", $0 print "Warning: I don't know what to do with this line:" > logfile print " ", $0 > logfile } #*1Files written by {\tt SPIDER}. {\tt SPIDER} writes output to a number of files. Because 4.3~BSD AWK is limited in the number of files it can write at one time, there is substantial overlap.
Here is a table: \noindent\halign{\vrule height10pt depth3.5pt width0pt \it##\hfil\tabskip=1em&\tt##\hfil&\tabskip=0pt \hsize=4in\vtop{\noindent##\strut\par}\cr \noalign{\medskip} \bf Internal Name&\bf External Name&\omit\bf Description\hfil\cr \noalign{\smallskip} categoryfile&names.unsorted& names of categories, to be checked for duplicates by {\tt nodups.awk} \cr cycles&cycle.test& potential cycles, to be checked by {\tt cycle.web} \cr grammarfile&grammar.web& grammar; included in {\tt weave.web} \cr ilkfile&names.unsorted& names of ilks, to be checked for duplicates by {\tt nodups.awk} \cr logfile&spider.slog& log file, to be consulted when things go wrong \cr macrofile&*web.tex& language-specific macro file, {\tt\string\input} by all \TeX{} files created by {\tt weave.web} \cr productions&productions.list& list of the productions (numbered) used in debugging \.{WEAVE} \cr reserved&scraps.web& code for converting reserved words to scraps. {\tt scraps.web} is included by {\tt weave.web} \cr scrapfile&scraps.web& code for converting tokens to scraps. {\tt scraps.web} is included by {\tt weave.web} \cr tlang&outtoks.web& Information about what language we're webbing. {\tt outtoks.web} is included by {\tt tangle.web}. \cr tokennamefile&names.unsorted& list of names of all the tokens, to be checked by {\tt nodups.awk} \cr translationfile&trans\_keys.unsorted& list of names of all the translation keywords. Checked for duplicates by {\tt nodups.awk}, and also for recognizability by {\tt transcheck.awk}. \cr ttokfile&outtoks.web& This is the tokenization code for {\tt TANGLE}. \cr wlang&scraps.web& Information about what language we're webbing. {\tt scraps.web} is included by {\tt weave.web}. \cr } # Every action writes information to a log file. This log file can be used to check up on what happened. #= logfile = "spider.slog" # Here we write the names of the key words used in translations. #= translationfile = "trans_keys.unsorted" # We write tokens out to two files: |scrapfile| for \.{WEAVE}, and |ttokfile| for \.{TANGLE}. #= scrapfile = "scraps.web" print "@*Scrap code generated by {\\tt SPIDER}." > scrapfile ttokfile = "outtoks.web" print "@*Token code generated by {\\tt SPIDER}." > ttokfile # The reserved word stuff gets a file of its own, or it would in an ideal world. #= reserved = "scraps.web" ## use same file; not enough files # We'll also end up writing a list of token names, for name checking purposes. #= tokennamefile = "names.unsorted" ## cut down on number of output files # We also write out every ilk, so we'll be able to look for name clashes with translations and so on. #= ilkfile = "names.unsorted" ## cut down on number of output files # We also write all the category names to a separate file, so we can check for duplicates later. #= categoryfile = "names.unsorted" ## cut down on number of output files # We use a special file to write grammar information: #= grammarfile = "grammar.web" print "@*Grammar code generated by {\\tt SPIDER}." > grammarfile # We use the language information to write banners and macro information. We combine this with other stuff because AWK can't handle more than 10 output files. #= tlang = "outtoks.web" ## same as ttokfile wlang = "scraps.web" ## same as scrapfile # We will write a list of the successfully parsed productions to a separate file. #= productions = "productions.list" # These productions will get fed to {\tt cycle.awk}, which looks for cycles. #= cycles = "cycle.test" #*Processing translations.
Translations tell \.{WEAVE} or \.{TANGLE} what to write out in particular circumstances (e.g.~after scanning a particular token, or when firing some production). They are described at some length in the ``\.{SPIDER} User's Guide.'' Translations are enclosed in angle brackets and separated by dashes. They can contain key words, digits, the self marker~`{\tt*}', or quoted strings. Since we can't put a space or dash into strings, we allow the use of key words |space| and |dash| to stand for those symbols. #^space#> #^dash#> Other key words are interpreted by \.{WEAVE} as prettyprinting instructions: \yskip\hang |break_space| denotes an optional line break or an en space; \yskip\hang |force| denotes a line break; \yskip\hang |big_force| denotes a line break with additional vertical space; \yskip\hang |opt| denotes an optional line break (with the continuation line indented two ems with respect to the normal starting position)---this code is followed by an integer |n|, and the break will occur with penalty $10n$; \yskip\hang |backup| denotes a backspace of one em; \yskip\hang |cancel| obliterates any |break_space| or |force| or |big_force| tokens that immediately precede or follow it and also cancels any |backup| tokens that follow it; \yskip\hang |indent| causes future lines to be indented one more em; \yskip\hang |outdent| causes future lines to be indented one less em. \yskip\hang |math_rel|, |math_bin|, and |math_op| will be translated into \.{\\mathrel\{}, \.{\\mathbin\{}, and \.{\\mathop\{}, respectively. \yskip\noindent All of these tokens are removed from the \TeX\ output that comes from programming language text between \pb\ signs; |break_space| and |force| and |big_force| become single spaces in this mode. %The translation of other %program texts results in \TeX\ %control sequences \.{\\1}, \.{\\2}, %\.{\\3}, \.{\\4}, \.{\\5}, \.{\\6}, %\.{\\7} corresponding respectively to %|indent|, |outdent|, |opt|, %|backup|, |break_space|, |force|, and %|big_force|. However, A sequence of consecutive `\.\ ', |break_space|, |force|, and/or |big_force| tokens is first replaced by a single token (the maximum of the given ones). %Some Other control sequences in the \TeX\ output will be %`\.{\\\\\{}$\,\ldots\,$\.\}' %surrounding identifiers, `\.{\\\&\{}$\,\ldots\,$\.\}' surrounding %reserved words, `\.{\\.\{}$\,\ldots\,$\.\}' surrounding strings, %`\.{\\C\{}$\,\ldots\,$\.\}$\,$|force|' surrounding comments, and %`\.{\\X$n$:}$\,\ldots\,$\.{\\X}' surrounding module names, where %|n| is the module number. # We write out the names of all the key words used in translations, so we can check that \.{WEAVE} can be expected to recognize them. This helps us catch the problem early if a translation given is not one of the above (as opposed to, say, having the C~compiler fail to compile \.{WEAVE}). #= for (t in translation_keywords) { print t > translationfile } # #= for (t in translation_keywords) { num_of_translation_keywords++ } printf "You used %d translation keywords.\n", \ num_of_translation_keywords > logfile printf "You used %d translation keywords.\n", num_of_translation_keywords # If the macro facility worked right, we would use the following patterns to recognize items as they occur: #d cat_pattern = #=/[a-zA-Z][a-zA-Z_]*/#> #d trans_pattern = #=/<(([0-9]|[a-zA-Z][a-zA-Z_]*|"([^"]*\\")*[^"]*"|\*)-)*#>#& #=([0-9]|[a-zA-Z][a-zA-Z_]*|"([^"]*\\")*[^"]*"|\*)>/#> # Here's where we swallow a translation and spit out the \.{WEAVE} code to handle that translation.
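For example (an illustration of ours, not a translation \.{SPIDER} itself contains), a token carrying the translation \.{<force-"x"-indent>} would be split at the dashes into \.{force}, \.{"x"}, and \.{indent}, and would come out as the C fragment \.{app(force); app\_str("x"); app(indent);} (with \.{small\_app} in place of \.{app} when we are creating a scrap from a token).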
Since AWK has no functions, we define this as a module. When we're appending a key word {\it in the process of creating a scrap from a token}, we use |small_app| in preference to |app|, because |app|'s cleverness about mathness and dollar signs only works when reducing existing scraps, not when creating scraps from tokens. We'll expect the variable |append_keyword| to be set to either |"small_app"| or |"app"|. #"|#>= temp = substr(transstring,2,length(transstring)-2) ## kills awk bug trcnt = split(temp,trtok,"-") outstring = "" for (tridx=1;tridx<=trcnt;tridx++) { alternate=trtok[tridx] # if (alternate ~ #=/^[0-9]$/#>) { ## digit temp = sprintf("\tapp_str(\"%s\");\n",alternate) outstring=outstring temp } else if (alternate ~ #=/^[a-zA-Z_]+$/#>) { ## key word translation_keywords[alternate]=1 ## remember temp = sprintf("\t%s(%s);\n",append_keyword,alternate) ##Call |app| or |small_app| depending on whether we're reducing or creating scraps outstring=outstring temp } else if (alternate ~ #=/^\"([^"]*\\\")*[^"]*\"$/#>) { ## string temp = sprintf("\tapp_str(%s);\n",alternate) outstring=outstring temp } else if (alternate ~ #=/^\*$/#>) { ## self marker # outstring=outstring selfstring } else { print "Bogus translation", wherestring exitcode = -1 } } # Here we convert the key words |space| and |dash| to strings. We quote the strings, to be sure that they are handled by the string mechanism. #= if (alternate=="space") { alternate="\" \"" } else if (alternate=="dash") { alternate="\"-\"" } # There are some places (notably in productions) where the translation |"<*>"| makes no sense. In this case the caller sets |selfstring=""|, and we complain. #= if (selfstring=="") { print "Translation \"<*>\" makes no sense", wherestring exitcode = -1 } # There are times when we may want to convert a translation directly into a quoted string, usually for \.{TANGLE}'s benefit. Here, the only things allowed are quoted strings and |space| and |dash|. We peel off quote marks and concatenate things together, and then we put the quote marks back on at the end. #= temp = substr(transstring,2,length(transstring)-2) ## kills awk bug trcnt = split(temp,trtok,"-") outstring = "" for (tridx=1;tridx<=trcnt;tridx++) { alternate=trtok[tridx] # if (alternate ~ #=/^[0-9]$/#>) { ## digit print "Digit not allowed in restricted translation", wherestring exitcode = -1 } else if (alternate ~ #=/^[a-zA-Z_]+$/#>) { ## key word print "Key word not allowed in restricted translation", wherestring exitcode = -1 } else if (alternate ~ #=/^\"([^"]*\\\")*[^"]*\"$/#>) { ## string temp = substr(alternate,2,length(alternate)-2) ## strip quotes outstring=outstring temp } else if (alternate ~ #=/^\*$/#>) { ## self marker print "<*> not allowed in restricted translation", wherestring exitcode = -1 } else { print "Bogus restricted translation", wherestring exitcode = -1 } } outstring = "\"" outstring "\"" ## put quotes back on |outstring| #*Tokens. Tokens are pretty complicated. Each token has a string by which we recognize it in the input. This string is what immediately follows the |token| command. Then, there's another string that tells \.{TANGLE} how to write out the token. Finally, it has a category and a translation (so we can make a scrap out of it), and a mathness (to tell us whether it has to be in math mode, horizontal mode, or either). The \.{translation} and \.{mathness} have defaults. #*2Scanning for token descriptions. This module is used everywhere we must scan a line for token descriptions.
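For instance, a hypothetical command like $$\hbox{\.{token + category binop mathness yes translation <"+">}}$$ (the category name \.{binop} is made up for this example) would be scanned by stepping |i| over the attribute-value pairs, filling in |this_category|, |this_mathness|, |this_translation|, |this_name|, and |this_tangleto| as they appear.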
#= for (i=start_place;i<=NF;) { # } # # We check for the presence or absence of certain empty strings after scanning. #= if (this_name != "") { print "Error: name doesn't apply on line", NR-1 # } # #= if (this_tangleto != "") { print "Error: tangleto doesn't apply on line", NR-1 # } # #= if (this_category != "") { print "Error: category doesn't apply on line", NR-1 # } # #= if (this_translation != "") { print "Error: translation doesn't apply on line", NR-1 # } # #= if (this_category == "") { print "Error: you must give a category on line", NR-1 # } #*1Setting the default token descriptions. \.{SPIDER} maintains default information about {\em mathness} and {\em translation}, so these can be omitted from token descriptions. We can change the operative defaults at any time by using a |"default"| command. It, too, scans for keywords, using the standard scanning module. #= #=/^default /#> { print "Setting defaults..." > logfile start_place=2 # # # # default_translation=this_translation default_mathness=this_mathness #@ print "\tdefault translation is", default_translation > logfile print "\tdefault mathness is", default_mathness > logfile #@ next } # Normally, we will set all quantities to the defaults before scanning: #= this_translation=default_translation this_mathness=default_mathness this_name="" this_category="" this_tangleto="" # When \.{SPIDER} starts up, the defaults are already set: #= default_translation="<*>" default_mathness="maybe" #*1Recognizing token designators. Let's begin by discussing the way \.{WEAVE} and \.{TANGLE} represent tokens internally. \.{WEAVE} and \.{TANGLE} process tokens in a two-step process. Both read the token from the input using |get_next|, which returns a unique eight-bit number representing the token. Generally printable ASCII characters represent themselves, and other tokens get numbers in the unprintable range. \.{TANGLE} assigns ranges to some tokens ahead of time: |string| is 2, |identifier| is #'202, and so on. Tokens that we introduce to \.{TANGLE} must have numbers between #'13 and #'37 inclusive. Rather than work with eight-bit numbers themselves, we use names for the tokens. This makes \.{WEAVE} and \.{TANGLE} easier to debug when things go wrong. In \.{WEAVE}, the category, mathness, and translation are all attached to a scrap based on the eight-bit number returned by |get_next|, and this is done at a later time. In \.{TANGLE}, characters are written to the output file(s) based on the token code, which can be either eight bits for simple tokens or sixteen for identifiers and things. Our mission in this section will be to read in all the token information from the {\tt token} command, and to create the names and numbers used by \.{WEAVE} and \.{TANGLE} to represent the tokens. In the next section we will write the code that processes the tokens for both \.{WEAVE} and \.{TANGLE} (lexical analysis in |get_next|, and subsequent processing elsewhere). You will pardon us if things get a bit tedious. # The {\tt token} command is used to specify tokens that are not reserved words. Reserved word tokens get special treatment all their own. #= #=/^token /#> { print "Token", $2 > logfile if ($2=="identifier") { # } else if ($2=="number") { # } else if ($2=="newline") { # } else if ($2=="pseudo_semi") { # } else if ($2 ~ #=/[a-zA-Z0-9]+/#>) { ## we recognize no other names print "Error: unknown token species:", $2 # } else { # } categories[this_category]=1 ## is this right?
#^questions#> next } # Identifiers, numbers (and string literals), newlines, and the special token \.{pseudo\_semi} are predefined. #= # this_translation="" start_place=3 # # # # # id_category=this_category id_mathness=this_mathness # We have yet to implement a separate procedure for numerics and strings! #= print "Warning: numeric constants and strings are",\ "identified in this WEAVE." print "Warning: numeric constants and strings are",\ "identified in this WEAVE." > logfile # this_translation="" start_place=3 # # # # # number_category=this_category number_mathness=this_mathness # #= # start_place=3 # # # # newline_category=this_category newline_mathness=this_mathness newline_translation=this_translation # #= # start_place=3 # # # # pseudo_semi_category=this_category pseudo_semi_mathness=this_mathness pseudo_semi_translation=this_translation # Here is where things get a bit more interesting; we have to consider all the other (non-reserved-word) tokens, and find a way to convert them to \.{WEAVE} and \.{TANGLE}'s internal form. We take single characters straight, except for those that must be escaped one way or another. For multicharacter tokens, we have to invent a name and a number, which process we will describe below. Tokens have a zillion attributes: not just category, translation, and their friends, but things like internal representations, the length of the input string, you name it. We remember the length of the longest token in the system, because when we go to recognize tokens we will look for the longest first and then on down. We maintain that length at the very end here. #= this_string=$2 # ## print NF, "fields on line", NR-1 ## caught a bug in DEC awk $2 = this_string ## print NF, "fields on line", NR-1 # if (tokens[$2]!="") { print "Warning: token", $2, "defined twice" } tokens[$2]=1 ## remember this token # # # # This code represents an undocumented feature. We should replace it by allowing restricted translations in |$2|, and then documenting it. When doing this, we'll have to match the full |trans_pattern| in all its glory; a mere |#=/<.*>/#>| won't do. #= old_string = this_string this_string = "" ## Invariant: |this_string old_string| corresponds to result, and ## |"{space}"| is translated in |this_string| but not |old_string| idx = index(old_string,"{space}") while (idx != 0) { temp =substr(old_string,1,idx-1) this_string = this_string temp " " old_string = substr(old_string,idx+7) idx = index(old_string,"{space}") } this_string = this_string old_string # Tokens need an internal eight-bit representation. For single characters (which are assumed to be printable), we use the ASCII code as the internal representation. Multicharacter tokens will be assigned a name and a number. (The names may be specified by the user or generated by \.{SPIDER}.) Unfortunately the numbers for \.{WEAVE} and \.{TANGLE} have to be different (the reasons will only depress you). We assign \.{WEAVE} numbers by starting numbering from |highesttoken|, and working our way down. At the moment |highesttoken==200|, and I can't remember whether 200 is a ``magic number'' or not, so you'd better assume that it is. We get the token numbers for \.{TANGLE} by subtracting an offset, as you'll see later. #= highesttoken=200 ## highest numbered token tokennumber=highesttoken # At the end we check to make sure we haven't used up too many numbers for tokens. \.{WEAVE} token numbers must be |>=127|.
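To make the bookkeeping concrete: counting down from |highesttoken==200|, \.{WEAVE} has room for $200-127+1=74$ token numbers, while \.{TANGLE} has room for only $21+3=24$ (the 21 values from #'13 through #'37 inclusive, plus the three that are already spoken for). The checks below enforce exactly these limits.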
#= if (tokennumber<127) { print "Error: too many token names for WEAVE --- over by",\ 127-tokennumber exitcode=-1 } # \.{TANGLE} tokens must be between #'13 and #'37 inclusive. We add three to the number because \.{TANGLE} has special definitions for the three tokens taken off the top. #= if (highesttoken-tokennumber > #'37-(#'13-1)+3) { \ ## number of tokens in |#'13|--|#'37|, plus 3 print "Error: too many token names for TANGLE --- over by",\ highesttoken-tokennumber - (#'37-(#'13-1)+3) exitcode=-1 } # The token name is what \.{WEAVE} and \.{TANGLE} will use internally to refer to the token's internal representation as an eight-bit code. We use names instead of using the numbers directly in the vague hope that it will make \.{WEAVE} and \.{TANGLE} easier to debug when something goes wrong. For multi-character tokens, the name will be a \.{WEB} macro that is defined to be equal to the token's eight-bit code. If the token is a single character, its ``name'' will be that character, quoted with single quotes. The single-character tokens \.{@}, \.{\\}, and \.{'} require special handling, since they have to be escaped in some way to be quoted. Once we've computed the name, we put it in |tokenname[$2]|. #= if ($2=="@") { $2="@@" tokenname[$2]="'@@'" tokenlength[$2]=1 } else if ($2=="'" || $2 == "\\") { $2="\\" $2 tokenname[$2]="'" $2 "'" tokenlength[$2]=1 } else if (length($2)>1) { # } else { temp = sprintf("'%s'", $2) tokenname[$2] = temp tokenlength[$2]=1 } # For the long tokens, we generate a name by which we'll refer to the token. That name will actually be defined to be a number, which we'll take to be the current value of |tokennumber|. We'll write in |tokentest[$2]| the C~code used to recognize that token, and in |tokenlength[$2]| we'll leave that token's length. (The length is used both to find long tokens before short ones, and to avoid finding long ``tokens'' that actually go beyond the end of the line.) #= tokenname[$2]="SP_gen_token_" tokennumber tokennumbers[$2]=tokennumber tokennumber-- ## figure out how to recognize the token temp = sprintf( "strncmp(\"%s\",loc-1,%d)==0", $2, length($2)) tokentest[$2]=temp tokenlength[$2]=length($2) # The setting of attributes is as for all tokens: #= # this_name=tokenname[$2] start_place=3 # # tokencategory[$2]=this_category tokenmathness[$2]=this_mathness tokentranslation[$2]=this_translation tokenname[$2]=this_name tokentangleto[$2]=this_tangleto # We have to remember the length of the longest token so we can recognize long tokens before short ones. #= temp = tokenlength[$2] if (temp > maxtokenlength) { maxtokenlength=temp } # We're paranoid. #= if (tokenlength[$2]>1 && tokennumbers[$2]=="") { print "This can't happen: token", $2, "is long", \ "but has no number" exitcode = -1 } #*1Writing {\tt WEB}'s lexical analysis code. The token recognition problem is the same for \.{WEAVE} and \.{TANGLE}. Both have routines called |get_next| that recognize the tokens on input. Most of |get_next| is prefabricated (and the same in both \.{WEAVE} and \.{TANGLE}), but we have to put in the part that recognizes multi-character non-alphanumeric tokens. We write the same code to both \.{WEAVE} and \.{TANGLE}. #= tempfile = scrapfile # tempfile = ttokfile # # This is how we do it. #= print "@ Here we input tokens of more than one character" > tempfile print "@=" > tempfile # # We look for long tokens, then shorter, and so on. We have to make sure we don't look beyond the end of a line. 
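For example, for a two-character token \.{!=} that had drawn, say, the number 200, the code written here would come out roughly as $$\hbox{\.{if (loc+1<=limit) \{ if (strncmp("!=",loc-1,2)==0) \{ loc += 1; return SP\_gen\_token\_200; \} \}}}$$ (the token and its number are invented for this illustration).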
#= for (len=maxtokenlength; len>=2; len--) { printf "if (loc+%d<=limit) {\n", len-1 > tempfile # printf "\t}\n" > tempfile } # # #= notfirst=0 for (t in tokentest) { if (tokenlength[t]==len) { printf "\t" > tempfile if (notfirst==1) { printf "else " > tempfile } notfirst=1 printf "if (%s) {\n", tokentest[t] > tempfile printf "\t\tloc += %d;\n", len-1 > tempfile printf "\t\treturn %s;\n\t\t}\n", tokenname[t] > tempfile } } # #= for (t in tokentest) { if (tokenlength[t]==1) { print "This can't happen: token", t, "is of length 1 but", \ "it has a test" exitcode=-1 } } #*1Writing out {\tt WEAVE}'s token-to-scrap code. Here is where we write the code that converts an already-recognized token (from |get_next|) into a scrap. There are several different kinds of tokens, and each requires a slightly different treatment. We will write out the code for the different species one at a time. #= print "Writing out predefined scraps" > logfile # # # # print "Writing out token scraps" > logfile # # This is how we write out the information for the identifier. #= if (id_category != "") { print "@ @=" > scrapfile print "p=id_lookup(id_first, id_loc,normal);" > scrapfile print "if (p->ilk==normal) {" > scrapfile print " small_app(id_flag+p-name_dir);" > scrapfile printf " app_scrap(SP_%s,%s_math);", \ id_category, id_mathness > scrapfile appended[id_category]=1 print " /* not a reserved word */" > scrapfile print "}" > scrapfile print "else if reserved(p) {" > scrapfile print "@;" > scrapfile print "}" > scrapfile print "else {" > scrapfile print " err_print(\"! Identifier with unmentioned ilk\");" > scrapfile print "@.Identifier with unmentioned ilk@>" > scrapfile print "}" > scrapfile } else { print "Error: I don't know what to do with an identifier" print " Please give me a \"token identifier ...\"" exitcode = -1 } # We hold the name |"identifier"|, and we reserve a number for identifiers. #= tokennumbers["identifier"]=tokennumber; tokennumber-- tokenname["identifier"]="identifier" # This is how we write out the string or constant scrap, at the end. #= print "Warning: TeX strings have the same category as ", \ "numeric constants in this WEAVE." print "Warning: TeX strings have the same category as ", \ "numeric constants in this WEAVE." > logfile if (number_category != "") { print "@ For some reason strings, constants,",\ " and \TeX\ strings are identified." > scrapfile print "That has to be fixed." > scrapfile print "@=" > scrapfile printf "app_scrap(SP_%s,%s_math);\n", number_category,\ number_mathness > scrapfile appended[number_category]=1 } else { print "Error: I don't know what to do with a numeric constant" print " Please give me a \"token number ...\"" exitcode = -1 } # We hold names and numbers for constants and strings, as well as identifiers.
#= tokennumbers["constant"]=tokennumber; tokennumber-- tokenname["constant"]="constant" tokennumbers["string"]=tokennumber; tokennumber-- tokenname["string"]="string" # #= if (newline_category != "") { print "@ @=" > scrapfile transstring=newline_translation selfstring="small_app(next_control);" wherestring="in translation of token newline" append_keyword="small_app" #"|#> print outstring > scrapfile printf " app_scrap(SP_%s,%s_math);\n", newline_category,\ newline_mathness > scrapfile appended[newline_category]=1 } else { print "Error: I don't know what to do with a newline" print " Please give me a \"token newline ...\"" exitcode = -1 } # #= if (pseudo_semi_category != "") { print "@ @=" > scrapfile transstring=pseudo_semi_translation selfstring="small_app(next_control);" wherestring="in translation of token pseudo_semi" append_keyword="small_app" #"|#> print outstring > scrapfile printf " app_scrap(SP_%s,%s_math);\n", pseudo_semi_category,\ pseudo_semi_mathness > scrapfile appended[pseudo_semi_category]=1 } else { printf "Error: I don't know what to do with a pseudo_semi (%s;)",\ substr(at_sign,1,1) print " Please give me a \"token pseudo_semi ...\"" exitcode = -1 } # Here is how we write out the code that converts ordinary tokens to scraps: #= print "@ @=" > scrapfile for (t in tokens) { temp = tokenname[t] printf "case %s:\n", temp > scrapfile transstring=tokentranslation[t] selfstring="small_app(next_control);" wherestring= sprintf ("in translation of token %s", t) append_keyword="small_app" #"|#> print outstring > scrapfile printf "\tapp_scrap(SP_%s,%s_math);\n", tokencategory[t], \ tokenmathness[t] > scrapfile temp = tokencategory[t] appended[temp]=1 #^append check#> print "\tbreak;" > scrapfile } #*3{\tt TANGLE}'s token-to-output conversion. We have to write special cases for things appearing in |tokennumbers|. The output conventions for |string|, |constant| and |identifier| are fixed by \.{TANGLE}. One day we have to improve \.{TANGLE}'s treatment of spacing in the output; at the moment it just makes sure there are spaces between adjacent identifiers or numbers. #^future enhancements#> #= print "@ @=" > ttokfile for (t in tokennumbers) { # printf "case %s:\n", tokenname[t] > ttokfile this_tangleto = tokentangleto[t] if (this_tangleto=="") { printf "\tC_printf(\"%%s\",\"%s\");\n",t > ttokfile } else { printf "\tif (out_state==verbatim) {\n" > ttokfile printf "\t\tC_printf(\"%%s\",\"%s\");\n",t > ttokfile printf "\t} else {\n" > ttokfile # printf "\t}\n" > ttokfile } print "\tif (out_state!=verbatim) out_state=misc;" > ttokfile print "break;" > ttokfile } # We also have to write something for the tokens that aren't in |tokennumbers| but which have a nonnull |tokentangleto| anyway. 
#= print "@ @=" > ttokfile for (t in tokentangleto) { # if (tokennumbers[t]!="" || tokentangleto[t]=="") continue if (t=="@") { thistangletokname = "@@" } else if (t=="\\" || t=="'") { thistangletokname = "\\" t } else { thistangletokname = t } printf "case '%s':\n", thistangletokname > ttokfile this_tangleto = tokentangleto[t] if (this_tangleto=="") { print "This can't happen -- null tangleto for", t, wherestring exitcode = -1 } else { printf "\tif (out_state==verbatim) {\n" > ttokfile printf "\t\tC_printf(\"%%s\",\"%s\");\n",t > ttokfile printf "\t} else {\n" > ttokfile # printf "\t}\n" > ttokfile } print "\tif (out_state!=verbatim) out_state=misc;" > ttokfile print "break;" > ttokfile } # The tokens for |string|, |constant|, and |identifier| are treated specially by \.{TANGLE}; code to handle them already lives in \.{TANGLE}.web. Therefore, we don't gum up the works with our scheming. #= if (t=="string"||t=="constant"||t=="identifier") continue # This is somewhat like the translation code, but tuned for \.{TANGLE} #= oldwherestring = wherestring wherestring = "for tangleto " wherestring #@ transstring=this_tangleto # printf "\tC_printf(\"%%s\",%s);\n",outstring > ttokfile #@ wherestring=oldwherestring #*3Defining the token names. At some point we'll have to define all these names, for both \.{TANGLE} and \.{WEAVE}. We may as well show how we do that now. #= tempfile = scrapfile # tempfile = ttokfile # # We use an ugly trick to get the token numbers different for \.{WEAVE} and \.{TANGLE}: #= print "@ Here are the definitions of the token names" > tempfile for (t in tokennumbers) { temp = tokennumbers[t] if (temp==0) continue ## don't know why we need this!! if (tempfile==ttokfile) { ## output to \.{TANGLE} # ## already defined in \.{TANGLE} temp = temp + #'37 + 3 - highesttoken ## hackety hack! ## +3 because three highest are already defined! } printf "@d %s = %s\n", tokenname[t], temp > tempfile } # Some token names are just characters quoted with |'|. We write out all the others. #= for (t in tokenname) { temp = tokenname[t] if (substr(temp,1,1) != "'") { # print temp > tokennamefile } } # #= tempa=substr(temp,1,3) if (tempa=="SP_") { temp = substr(temp,4) ## remove |"SP_"| } #*Reserved words and ilks. \.{TANGLE} doesn't even need the {\it idea} of reserved words; it treats them like all other identifiers. \.{WEAVE}, however, needs to be able to recognize reserved words to do prettyprinting. \.{WEAVE} uses a two-tiered system for coping with reserved words. I think this system was really designed to make it easier to code \.{WEAVE} by hand, and is therefore not of much interest for \.{SPIDER}, but we retain it as a matter of least resistance. Every reserved word belongs to an ilk, and it is the ilks that, like tokens, have translations, categories, and so on. I have made a bewildering array of defaults that is probably full of bugs. We use a special convention to initialize the |this_| family. #= #=/^ilk /#> { print "Ilk", $2 > logfile # # this_name="" start_place=3 # # # ilk_category[$2]=this_category ilk_mathness[$2]=this_mathness ilk_translation[$2]=this_translation next } # The pernicious option here is to be able to leave off the category, so that an item of ilk |fish_like| will get category |fish|. #= if ($2 ~ #=/^[a-zA-Z_]+_like$/#> && $0 !~ #=/ category /#>) { ## give default category this_category = substr($2,1,length($2)-5) categories[this_category]=1 } # For the reserved words, our only option is to set an ilk. 
We go through wild and assuredly ill-advised gyrations attempting to set all the default properties of that ilk. If the ilk is omitted, we make a new ilk by attaching the string |"_like"| to the name of the reserved word. {\bf Don't use this feature; it embarrasses the author.} #^ill-advised#> #= #=/^reserved /#> { print "Reserved word", $2 > logfile if ($0 !~ #=/ ilk /#>) { # } for (i=3; i<=NF;) { if ($i == "ilk") { i++ reservedilk[$2]=$i has_reserved[$i]=1 ## remember that ilk has some reserved word i++ } else { print "Error: bad reserved word attribute:", $i, \ "on line", NR-1 # } } # next } # Here is our feeble attempt to make up an ilk for a reserved word for which no ilk is given. The default ilk for |"with"| is |"with_like"|, and so on. {\bf Please, please don't do this.} #= temp = $2 "_like" reservedilk[$2]=temp if (ilk_translation[temp]=="") { ilk_translation[temp]=default_translation } has_reserved[temp]=1 if (ilk_mathness[temp]=="") { ilk_mathness[temp]=default_mathness } ## and default category for that ilk is the resword itself if (ilk_category[temp]=="") { ilk_category[temp]=$2 categories[$2]=1 } ilk_is_made_up[temp]=1 ## we really should do something with this #^mistakes#> #*1Telling {\tt WEAVE} how to recognize reserved words. At the end, we'll write out definitions for the ilk names, and we'll write translations of all the ilks. #= print "Writing out reserved words and ilks" > logfile ilkno=64 print "@ Here is a list of all the ilks" > reserved for (i in ilk_translation) { printf "@d SP_%s = %d\n", i, ilkno > reserved ilkno++ } # Here is where we write the code that converts reserved word tokens into scraps. #= print " " > reserved print "@ Here are the scraps we get from the reserved words" > reserved print "@d the_word = res_flag+p-name_dir" > reserved print "@=" > reserved print "switch (p->ilk) {" > reserved for (t in ilk_translation) { printf "\tcase SP_%s: \n\t\t", t > reserved transstring=ilk_translation[t] selfstring="small_app(the_word);" wherestring= sprintf ("in translation of ilk %s", t) append_keyword="small_app" #"|#> if (trcnt>0) ## at least one text in the translation has_translation[t]=1 print outstring > reserved printf "\tapp_scrap(SP_%s,%s_math);\n", ilk_category[t], \ ilk_mathness[t] > reserved temp=ilk_category[t] appended[temp]=1 #^append check#> printf "\t\tbreak;\n" > reserved } print "}" > reserved # At the end, we'll have to enter each reserved word in the identifier table, along with its ilk. #= print "@ @=" > reserved for (i in reservedilk) { printf "id_lookup(\"%s\",NULL,SP_%s);\n", i, reservedilk[i] > reserved } # At the very end, we'll make sure every ilk has both a reserved word and some translation. {\bf Perhaps this could be cleaned up a bit?} #= for (i in ilk_translation) { if (has_reserved[i] != 1) { print "Error: there is no reserved word of ilk", i exitcode=-1 } if (has_translation[i] != 1) { print "Error: ilk", i, "has no translation" exitcode=-1 } } # #= for (i in ilk_translation) { print i > ilkfile } # #= for (i in ilk_translation) number_of_ilks++ for (i in reservedilk) number_of_reserved_words++ printf "You defined %d reserved words of %d ilks.\n", \ number_of_reserved_words, number_of_ilks printf "You defined %d reserved words of %d ilks.\n", \ number_of_reserved_words, number_of_ilks > logfile #*The prettyprinting grammar. The most intricate part of \.{WEAVE} is its mechanism for converting programming language code into \TeX\ code. 
A ``bottom up'' approach is used to parse the programming language material, since \.{WEAVE} must deal with fragmentary constructions whose overall ``part of speech'' is not known. At the lowest level, the input is represented as a sequence of entities that we shall call {\it scraps}, where each scrap of information consists of two parts, its {\it category} and its {\it translation}. The category is essentially a syntactic class, and the translation is a token list that represents \TeX\ code. Rules of syntax and semantics tell us how to combine adjacent scraps into larger ones, and if we are lucky an entire program text that starts out as hundreds of small scraps will join together into one gigantic scrap whose translation is the desired \TeX\ code. If we are unlucky, we will be left with several scraps that don't combine; their translations will simply be output, one by one. The combination rules are given as context-sensitive productions that are applied from left to right. Suppose that we are currently working on the sequence of scraps $s_1\,s_2\ldots s_n$. We try first to find the longest production that applies to an initial substring $s_1\,s_2\ldots\,$; but if no such productions exist, we try to find the longest production applicable to the next substring $s_2\,s_3\ldots\,$; and if that fails, we try to match $s_3\,s_4\ldots\,$, etc. A production applies if the category codes have a given pattern. For example, one of the productions is $$\hbox{\.{open [ math semi <\.{"\\\\,"}-opt-5> ] --> open math}}$$ and it means that three consecutive scraps whose respective categories are |open|, |math|, and |semi| are con\-verted to two scraps whose categories are |open| and |math|. The |open| scrap has not changed, while the string \.{<"\\\\,"-opt-5>} indicates that the new |math| scrap has a translation composed of the translation of the original |math| scrap followed by the translation of the |semi| scrap followed by `\.{\\,}' followed by `|opt|' followed by `\.5'. (In the \TeX\ file, this will specify an additional thin space after the semicolon, followed by an optional line break with penalty 50.) There is an extensive discussion of the grammar, with examples, in the ``Spider User's Guide.'' Y'oughta read it. #*1Scanning a production. A production in the grammar is written as a sequence of category names and translations, followed by a right arrow (\.{-->}), followed by a category name. When \.{WEAVE} is scanning the sequence of scraps that makes up a module, it checks to see whether the categories of those scraps match the categories given on the left side of the production. If so, the production fires, and the scraps and translations on the left side of the arrow are combined into a single, new scrap, and the new scrap is given the category from the right side of the arrow. The scraps which are combined are called the firing scraps, #^firing scraps#> and the category given to the combination is called the target category. Instead of a category name, e.g.~``\.{math},'' one can write a list of category names, e.g.~``\.{(open\vert lsquare)}''. A scrap matches the list if and only if its category is one of the names listed. One can also use the wildcard ``\.?'', which any scrap matches. On the right-hand side, one can write a \## followed by a number in place of the target category name.
If we specify the target category as ``\.{\##2}'', for example, it means ``give the new scrap the same category as the second scrap that matched the left side of the production.'' # Here is the whole syntax as quoted from the ``Spider User's Guide'' \begingroup\def\\{\par\noindent\ignorespaces}\tt \noindent\syntax{production} \produces\\\quad \syntax{left context} [ \syntax{firing instructions} ] \syntax{right context} --> \syntax{left context} \syntax{target category} \syntax{right context}\\ \syntax{left context} \produces~\syntax{scrap designators}\\ \syntax{right context} \produces~\syntax{scrap designators}\\ \syntax{firing instruction} \produces \syntax{scrap designator}\\ \syntax{firing instruction} \produces \syntax{translation}\\ \syntax{scrap designator} \produces~?\\ \syntax{scrap designator} \produces~\opt{!}\syntax{marked category}\\ \syntax{scrap designator} \produces~\opt{!}\syntax{category alternatives}\\ \syntax{category alternatives} \produces~(\syntax{optional alternatives}\syntax{marked category})\\ \syntax{optional alternative} \produces~\syntax{marked category}\vert\\ \syntax{marked category} \produces~\syntax{category name}\opt{*}\\ \syntax{target category} \produces~\#\syntax{integer}\\ \syntax{target category} \produces~\syntax{category name}\\ \endgroup # Here is the pattern that reads productions. In most of the modules below, we read through some of the fields of the production. We use |i| to remember what field we are about to examine. When a module terminates, |$i| is left pointing to the first field of interest to the next module. #= #=/-->/#> { # # # # # # # # # # # # # next } ## \.{/-->/} # Each scrap in the production will be given a position |pos|, beginning with 1. (Using 1 and not 0 lets us make good use of the fact that uninitialized AWK variables will have value zero.) We will remember the positions of the scraps that get reduced; they will be from |lowpos| to |highpos-1|. We keep track of the production number in |prodnum|, and we save a copy of the input line in |inputline[prodnum]|. #= lowpos=0; highpos=0; pos=1 prodnum=prodnum+1 inputline[prodnum]=$0 print "Parsing production", prodnum, $0 > logfile # This is the guts of the parsing. We have to read each field in the production, determine whether it is category or translation information, and act accordingly. Each scrap will be given a position |pos|. We will write in |test[pos]| the code needed to decide whether a particular scrap matches the pattern given in the production. Scraps can match a single category by name, a list of categories, or |"?"|, which every scrap matches. Categories can be starred, in which case we underline the index entry of the first identifier in the scrap's translation. We also write in |trans[pos]| the code necessary to produce the translations preceding the scrap at |pos|. #= trans[pos]="" for (i=1; i<=NF; i++) { if ($i ~ #=/<.*>/#>) { ## should be |trans_pattern| # } else if ($i ~ #=/^!?[a-zA-Z_]+(\*\*?)?$/#>) { ## |cat_pattern| # } else if ($i ~ #=/^!?\(([a-zA-Z_]+\|)*[a-zA-Z_]+\)(\*\*?)?$/#>){ # } else if ($i == "?") { # } else if ($i == "[") { lowpos=pos } else if ($i == "]") { highpos=pos } else if ($i=="-->") { break } else { ## we don't recognize the field print "Error: bad field is", $i, "in production on line", NR-1 # } } i++ # When we find a mistake, we just abandon the current production. Decrementing |prodnum| will make it as if this production never happened. 
#= prodnum-- # # We process the translation and add the result to the current translation for |pos|. #= transstring=$i selfstring="" ## senseless for productions wherestring= sprintf ("in production on line %d", NR-1) append_keyword="app" #"|#> trans[pos]=trans[pos] outstring # Here we'll set |test[pos]|. The phrase |test[pos]| will be a single C conjunct; if the test for each scrap is true, the whole production will fire. If we're called upon to make a scrap underlined or reserved, we'll add to |trans[pos]|. If a category is negated we add an extra clause to make sure nothing matches the zero category, since {\tt WEAVE} assumes no production ever matches a scrap with category zero. #= field[pos]=$i ## save this field to compare RHS # # cat = $i categories[cat]=1 ## remember |cat| is a category if (negation==0) { test[pos]=sprintf("(pp+%d)->cat==SP_%s",pos-1,cat) } else { test[pos]=sprintf("((pp+%d)->cat!=SP_%s && (pp+%d)->cat != 0)",\ pos-1,cat,pos-1) } # # # The list of categories is enclosed in parentheses and the individual categories are separated by vertical bars. We have to make the test for these things a disjunction, but processing is more or less like the processing for a single category. If a list of alternatives is negated we add an extra clause to make sure nothing matches the zero category, since {\tt WEAVE} assumes no production ever matches a scrap with category zero. #= field[pos]=$i ## save this field to compare RHS # if (negation==0) { test[pos]="(" ## open for a list of good alternatives } else { temp=sprintf("(pp+%d)->cat==0",pos-1) test[pos]="!(" temp "||" ## open for a list of bad alternatives } # temp = substr($i,2,length($i)-2) ## throw out parens m = split(temp,tok,"|") for (j=1;j<=m;j++) { cat = tok[j] categories[cat]=1 ## remember it's a category # temp=sprintf("(pp+%d)->cat==SP_%s",pos-1,cat) test[pos]=test[pos] temp ## add alternative to test if (j!=m) test[pos]=test[pos] "||\n" ## avoid line too long errors } test[pos]= test[pos] ")" # # We keep track of the rightmost occurrence of each category. This enables us to backtrack by exactly the right amount when a production fires and creates a new scrap. #= if (pos > highestpos[cat]) { highestpos[cat]=pos } # If a category or list of alternatives is preceded by an exclamation point (|"!"|), we set |negation|, and we will test for scraps that are {\it not} of that category or are {\it not} of one of the categories listed. #= temp = substr($i,1,1) if (temp=="!") { negation = 1 $i = substr($i,2) } else { negation = 0 } # Since both translations and tokens can add to |trans[pos]| we must make sure it is empty whenever we get a new |pos|. This device makes that easy. #= pos=pos+1 trans[pos]="" # If a category is single-starred, we take this construct to be the {\it definition} of that item, and we underline the index entry for this module. The |make_underlined| routine finds the first identifier in the translation of the starred scrap, and underlines the index entry for that identifier in this module. If a category is double-starred, we used to try to change the ilk of the appropriate identifier to make it a reserved word. The only use this ever had was in handling C typedefs, and it should probably be removed. #^mistakes#> In the meanwhile, double starring is like single starring.
#= if ($i ~ #=/^([a-zA-Z_]+|\(([a-zA-Z_]+\|)*[a-zA-Z_]+\))\*\*$/#>) { ## it's double-starred temp = sprintf("\tmake_underlined(pp+%d);\n",pos-1) trans[pos] = trans[pos] temp $i = substr($i,1,length($i)-2) } else if ($i ~ #=/^([a-zA-Z_]+|\(([a-zA-Z_]+\|)*[a-zA-Z_]+\))\*$/#>) { ## it's starred temp = sprintf("\tmake_underlined(pp+%d);\n",pos-1) trans[pos] = trans[pos] temp $i = substr($i,1,length($i)-1) } else if ($i ~ #=/\*$/#>) { ## a bad star? print "Error: can't remove stars in production on line", NR-1 # } # Wild cards are easy to process, but we do have to remember that not even a wild card matches a scrap of category zero. #= field[pos]=$i ## save this field to compare RHS test[pos]=sprintf("(pp+%d)->cat!=0",pos-1) ## anything nonzero matches highwildcard=pos ## we don't really need this? # # We reach this point in the program after we have read the arrow into |$i|. This module establishes in what ranges of |pos| the contexts fall: $$\vbox{\halign{##\hfil\tabskip1em&\hfil##\hfil\cr \bf Items&\bf Range\cr \noalign{\vskip2pt} left context&|1..lowpos-1|\cr firing instructions&|lowpos..highpos-1|\cr right context&|highpos..arrowpos-1|\cr }}$$ If |lowpos| and |highpos| haven't been set by the appearance of square brackets, we set them to make the contexts empty. None or both should be set. #= arrowpos=pos if (lowpos==0 && highpos==0) { lowpos=1 ## first transform position highpos=arrowpos ## first token not reduced ## (or one beyond last token position) } else if (lowpos==0 || highpos==0) { print "Error: square brackets don't balance in", \ "production on line", NR-1 # } # Here is the efficient place to update the rightmost (highest) position of {\it any} category. #= if (arrowpos-1 > highestposoverall) { highestposoverall=arrowpos-1 } # Dealing with grammars in which categories can be unnamed (using wildcards or negation) can be a pain in the ass. What we have to do, when reducing after firing a production, is move backwards enough so that we don't miss any earlier productions that are supposed to fire. This means we have to move back at least far enough so that the new scrap will match any unnamed category. {\bf But} we don't have to worry about wildcards (|"?"|) at the end of a production, because they would have matched anyway, even before the current production fired. Hence: #= for (hup=arrowpos-1; field[hup]=="?";) { hup-- } for (;hup>highestunknownpos;hup--) { temp=field[hup] temp=substr(temp,1,1) if (temp=="?" || temp =="!") { highestunknownpos=hup ## we know |hup>highestunknownpos| break ## redundant, since test will fail } } # Here is the error checking for context-sensitive productions. #= for (pos=1;pos<lowpos;pos++) { # i++ } # #= for (pos=highpos;pos<arrowpos;pos++) { # i++ } # #= if (i>NF || $i != field[pos]) { print "Error: token mismatch is: found", $i, \ "sought", field[pos], "on line", NR-1 # } # We process our target scrap in between checking the left and right contexts. This scrap can be the name of a category, or it can be ``$\##nnn$'', where $nnn$ refers to the number of a category on the left side of the arrow. In this way it is possible to match wildcards and lists of alternatives.
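For instance, in a made-up production $$\hbox{\.{? [ newline ] ? --> ? \##1 ?}}$$ the target \.{\##1} gives the new scrap whatever category the first scrap, a wildcard, happened to match.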
#= ## i points to the target category if (i>NF) { print "Error: no target category in production on line", NR-1 # } if ($i ~ #=/##[0-9]+/#>) { ## a number $i = substr($i,2) ## peel off the \## # targetcategory[prodnum]="Unnamed category" temp = sprintf("(pp+%d)->cat", $i-1) unnamed_cat[prodnum]=temp } else if ($i ~ #=/[a-zA-Z][a-zA-Z_]*/#>) { ## a category targetcategory[prodnum]=$i categories[$i]=1 ## remember this is a category } else { print "Error: unrecognizable target token", $i, \ "in production on line", NR-1 # } i++ # We call this at the end to make sure there aren't unused fields left over. #= if (i<=NF) { print "Error: used only " i-1 " of " NF " tokens", \ "in production on line", NR-1 # } # After having vetted the whole production, we combine the tests and translations for each |pos|. #= prodtest[prodnum]="" for (pos=1;pos<arrowpos;pos++) { if (pos>1) { prodtest[prodnum]=prodtest[prodnum] " &&\n\t\t" } prodtest[prodnum]=prodtest[prodnum] test[pos] } # #= prodtrans[prodnum]="" for (pos=lowpos;pos<highpos;pos++) { # } prodtrans[prodnum]=prodtrans[prodnum] trans[highpos] # #= ppstart[prodnum]=lowpos-1 tokensreduced[prodnum]=highpos-lowpos # #= if (highpos-lowpos==1) { printf "%d: %s --> %s\n", prodnum, field[lowpos], \ targetcategory[prodnum] > cycles wrotecycles = 1 } # If we never even had the possibility of a cycle, we still have to write out a dummy file so the cycle checker in the Makefile won't barf. # #= if(wrotecycles==0) { print "0: dummy --> nodummy" > cycles } # For error checking, we keep track of categories that get reduced in productions. We can't do this while scanning the production, because we don't know at the beginning what |lowpos| will be, since we might or might not ever see a left square bracket. If a particular category is never reduced, that merits a warning later on. #= temp = field[pos] tempa = substr(temp,1,1) if (tempa != "!") { if (temp ~ #=/^\(([a-zA-Z_]+\|)*[a-zA-Z_]+\)(\*\*?)?$/#>) { ## list of alternatives # temp = substr(temp,2,length(temp)-2) m = split(temp,tok,"|") for (j=1;j<=m;j++) { alternate = tok[j] reduced[alternate]=1 } } else if (temp ~ #=/^[a-zA-Z_]+(\*\*?)?$/#>) { # reduced[temp]=1 } else if (temp != "?") { print "Confusion: unintelligible field[pos]:", temp, \ "in production on line", NR-1 # } } # #= while (temp ~ #=/\*$/#>) { temp = substr(temp,1,length(temp)-1) } # #= for (c in categories) { if (reduced[c] != 1) { print "Warning: category", c, "never reduced" } } # Here's a check for the target token number. #= if ((0+$i)<1 || (0+$i)>=0+arrowpos) { print "Error: can't take token number", $i, "of", arrowpos-1, \ "tokens", "in production on line", NR-1 # } #*1Writing the scrap reduction code. Before writing the grammar, we want to define all of the category codes. #= print "Writing out category codes" > logfile print "@ Here is a list of category codes scraps can have" > grammarfile i=1 for (t in categories) { printf "@d SP_%s = %d\n",t,i > grammarfile i++ } print "@c" > grammarfile # We also want to make sure we can print the names of categories in case we need to debug.
#= print "##ifdef DEBUG" > grammarfile print "##define PRINT_CAT(A,B) case A: printf(B); break" > grammarfile print "print_cat(c) /* symbolic printout of a category */" > grammarfile print "eight_bits c;" > grammarfile print "{" > grammarfile print " switch(c) {" > grammarfile for (t in categories) { printf "PRINT_CAT(SP_%s,\"%s\");\n",t,t > grammarfile } print " case 0: printf(\"zero\"); break;" > grammarfile print " default: printf(\"UNKNOWN\"); break;" > grammarfile print " }" > grammarfile print "}" > grammarfile print "##endif DEBUG" > grammarfile print " " > grammarfile # And there goes the list... #= for (c in categories) { print c > categoryfile } # #= for (c in categories) { number_of_categories++ } printf "You used %d different categories in %d productions.\n", \ number_of_categories, prodnum printf "You used %d different categories in %d productions.\n", \ number_of_categories, prodnum > logfile printf "The biggest production had %d scraps on its left-hand side.\n", \ highestposoverall printf "The biggest production had %d scraps on its left-hand side.\n", \ highestposoverall > logfile # We will write a list of the successfully parsed productions to a separate file. The list will include production numbers, to which the user can refer when debugging. #= for (n=1; n<= prodnum; n++) { printf "%2d: %s\n",n,inputline[n] > productions } # Finally, we write out the code for all of the productions. Here is our first view of category checking: we want to make sure that each category can be appended, either by |app_scrap| or by |reduce|. We also want to make sure each category can be reduced by firing some production. We track these things using the arrays |appended| and |reduced|. We write the definition of |highestposoverall|, for safety. We used to write this code as a very deeply nested if-then-else, but that caused a yacc overflow in the generated code for C~{\tt WEAVE}. So now we write {\tt if (...) \LB...; goto end\_prods;\RB} #= print "Writing out grammar" > logfile print "@ Here is where we define |highestposoverall| and where we" > grammarfile print "check the productions." > grammarfile print "@d highestposoverall =", highestposoverall > grammarfile print "@=" > grammarfile for (n=1; n<=prodnum; n++) { if (n%5==0) print "@ @=" \ > grammarfile ## avoids overflowing \.{WEAVE} of \.{WEAVE} # # printf "if (%s) {\n\t/* %d: {\\tt %s} */\n%s",\ prodtest[n],n,this_string,prodtrans[n] > grammarfile # print "\tgoto end_prods;" > grammarfile printf "} " > grammarfile } printf "\n" > grammarfile print "end_prods:" > grammarfile # We do different things for a category that is unnamed. #= ttk=targetcategory[n] if (ttk == "Unnamed category") { #^append check#> printf "\treduce(pp+%d,%d,%s,%d,%d);\n",ppstart[n],\ tokensreduced[n],unnamed_cat[n],\ 1-highestposoverall,n > grammarfile } else { appended[ttk]=1 ## remember we appended this token #^append check#> reduction=highestpos[ttk] if (reduction grammarfile } # This is the place we check for errors. #^append check#> #^reduce check#> #= for (c in categories) { if (appended[c] != 1) { if (c=="ignore_scrap") { ## appended by \.{WEAVE} print "Warning: category", c, "never appended" } else { print "Error: category", c, "never appended" exitcode=-1 } } } # It's desirable to put the production in a comment, but we have to get rid of the confusing \vert, or \.{WEAVE} will think it introduces code. We also have to escape underscores and sharp signs, otherwise \TeX\ will think we want math mode. 
#= this_string = inputline[n] tempi = index(this_string,"|") while (tempi != 0) { tempa = substr(this_string,1,tempi-1) tempb = substr(this_string,tempi+1) this_string = tempa "\\vert " tempb tempi = index(this_string,"|") } templ = ""; tempr = this_string tempi = index(tempr,"_") while (tempi != 0) { tempa = substr(tempr,1,tempi-1) tempr = substr(tempr,tempi+1) templ = templ tempa "\\_" tempi = index(tempr,"_") } this_string = templ tempr templ = ""; tempr = this_string tempi = index(tempr,"##") while (tempi != 0) { tempa = substr(tempr,1,tempi-1) tempr = substr(tempr,tempi+1) templ = templ tempa "\\##" tempi = index(tempr,"##") } this_string = templ tempr # We have to keep these productions from making an input line too long. #= toolong=this_string; this_string="" while (length(toolong)>60) { idx=59 idchar = substr(toolong,idx,1) while (idx>1 && idchar!=" ") { idx-- idchar = substr(toolong,idx,1) } if (idx==1) idx=59 temp = substr(toolong,1,idx-1) toolong = substr(toolong,idx+1) this_string = this_string temp "\n" } this_string = this_string toolong #*The rest of {\tt SPIDER}. We present the remaining features of \.{SPIDER} in the order we used in the ``\.{SPIDER} User's Guide.'' #*2 Naming the target language. \.{SPIDER} is designed to help you build a \.{WEB} system for any programming language. We need to know the name of the language, and what extension to use when writing the tangled unnamed module. We use this information to pick a name for the file that will hold this \.{WEB}'s special \TeX{} macros, and we write |"\\input webkernel"| on that file. #= #=/^language /#> { language = $2 extension=language for (i=3; i<=NF;) { # } # # next } # #= if (language != "") { print "@ Here is the language-dependent stuff" > tlang if (version!="") version = ", Version " version printf "@d banner = \"This is %s TANGLE%s %s\\n\"\n", language, \ version, date > tlang printf "@=char C_file_extension[]=\"%s\";\n", extension \ > tlang #@ print "@ Here is the language-dependent stuff" > wlang printf "@d banner = \"This is %s WEAVE%s %s\\n\"\n", language, \ version, date > wlang print "@=" \ > wlang printf "*out_ptr='x'; tex_printf(\"\\\\input %sweb.te\");\n", \ extension > wlang printf "@ @=char C_file_extension[]=\"%s\";\n", extension \ > wlang } else { print "Error: you haven't given me any \"language\" information" exitcode=-1 } #*1Defining {\TeX} macros. The first thing we do after getting the language is write a line to the macro file. This makes sure the kernel \.{WEB} macros will be available. #= macrofile = extension "web.tex" print "\\input webkernel.tex" > macrofile # Processing macros is straightforward: everything between \.{macros begin} and \.{macros end} gets copied into the macro file. #= #=/^macros begin$/,/^macros end$/#> { if (begunmacs==0) { begunmacs=1 next } if ($0 ~ #=/^macros end$/#>) { begunmacs=0 next } if (macrofile=="") { if (complained==0) { print "Error: you must give \"language\"",\ "before \"macros\"" complained=1 # } } else { print $0 > macrofile } next } #*1Handling modules. We need to give module names a category, both when we define modules and when we use them in other modules. We might conceivably fool around with mathness, but we don't really intend to do so.
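A hypothetical command $$\hbox{\.{module definition decl use math}}$$ would make scraps from module definitions have category \.{decl} and scraps from module uses have category \.{math}.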
# We have to keep these productions from making an input line too long.
#=
toolong=this_string; this_string=""
while (length(toolong)>60) {
    idx=59
    idchar = substr(toolong,idx,1)
    while (idx>1 && idchar!=" ") {
        idx--
        idchar = substr(toolong,idx,1)
    }
    if (idx==1) idx=59
    temp = substr(toolong,1,idx-1)
    toolong = substr(toolong,idx+1)
    this_string = this_string temp "\n"
}
this_string = this_string toolong

#*The rest of {\tt SPIDER}.
We present the remaining features of \.{SPIDER} in the order in which
they appear in the ``\.{SPIDER} User's Guide.''

#*2Naming the target language.
\.{SPIDER} is designed to help you build a \.{WEB} system for any
programming language.  We need to know the name of the language, and
what extension to use when writing the tangled unnamed module.
We use this information to pick a name for the file that will hold this
\.{WEB}'s special \TeX{} macros, and we write |"\\input webkernel"| on
that file.
#=
#=/^language /#> {
    language = $2
    extension = language
    for (i=3; i<NF; ) {
        if ($i=="extension") {
            i++
            extension = $i
            i++
        } else if ($i=="version") {
            i++
            version = $i
            i++
        } else {
            print "Error: unknown language property", $i, \
                "on line", NR-1
            #
        }
    }
    #
    #
    next
}

#
#=
if (language != "") {
    print "@ Here is the language-dependent stuff" > tlang
    if (version!="") version = ", Version " version
    printf "@d banner = \"This is %s TANGLE%s %s\\n\"\n", language, \
        version, date > tlang
    printf "@=char C_file_extension[]=\"%s\";\n", extension \
        > tlang
    #@
    print "@ Here is the language-dependent stuff" > wlang
    printf "@d banner = \"This is %s WEAVE%s %s\\n\"\n", language, \
        version, date > wlang
    print "@=" \
        > wlang
    printf "*out_ptr='x'; tex_printf(\"\\\\input %sweb.te\");\n", \
        extension > wlang
    printf "@ @=char C_file_extension[]=\"%s\";\n", extension \
        > wlang
} else {
    print "Error: you haven't given me any \"language\" information"
    exitcode=-1
}

#*1Defining {\TeX} macros.
The first thing we do after getting the language is write a line to the
macro file.  This makes sure the kernel \.{WEB} macros will be
available.
#=
macrofile = extension "web.tex"
print "\\input webkernel.tex" > macrofile

# Processing macros is straightforward: everything between
\.{macros begin} and \.{macros end} gets copied into the macro file.
#=
#=/^macros begin$/,/^macros end$/#> {
    if (begunmacs==0) {
        begunmacs=1
        next
    }
    if ($0 ~ #=/^macros end$/#>) {
        begunmacs=0
        next
    }
    if (macrofile=="") {
        if (complained==0) {
            print "Error: you must give \"language\"",\
                "before \"macros\""
            complained=1
            #
        }
    } else {
        print $0 > macrofile
    }
    next
}

#*1Handling modules.
We need to give module names a category, both when we define modules
and when we use them in other modules.  We might conceivably fool
around with mathness, but we don't really intend to do so.
#=
#=/^module /#> {
    for (i=2; i<NF; ) {
        if ($i=="definition") {
            i++
            mod_def_cat=$i
            categories[$i]=1
            print "Module definition category set to", $i > logfile
            i++
        } else if ($i=="use") {
            i++
            mod_use_cat=$i
            categories[$i]=1
            print "Module use category set to", $i > logfile
            i++
        } else {
            print "Error: unknown module property", $i, \
                "on line", NR-1
            #
        }
    }
    #
    next
}

# Here's how we rig it:
#=
if (mod_def_cat!="") {
    print "@ @=" > scrapfile
    printf "app_scrap(SP_%s,no_math);\n", mod_def_cat > scrapfile
    appended[mod_def_cat]=1
} else {
    print "Error: I don't know what to do with a module definition"
    print " Give me a \"module definition ...\""
    exitcode=-1
}
if (mod_use_cat!="") {
    print "@ @=" > scrapfile
    printf "app_scrap(SP_%s,maybe_math);\n", mod_use_cat > scrapfile
    appended[mod_use_cat]=1
} else {
    print "Error: I don't know what to do with a module use"
    print " Give me a \"module use ...\""
    exitcode=-1
}

#*1At sign.
With \.{SPIDER}, we can designate any character we like as the ``magic
at sign.''
#=
#=/^at_sign /#> {
    if (NF==2 && length($2)==1) {
        if ($2=="@") {
            at_sign="@@"
        } else {
            at_sign=$2
        }
    } else {
        print "Error: I can't understand", $0
        print " Give me an at sign of length 1"
        #
    }
    next
}

# We write the at sign out to the grammar file and to \.{TANGLE}'s
token file.
#=
tempfile = grammarfile
#
tempfile = ttokfile
#

# It's trivially done.
#=
print "@ Here is the |at_sign| for the new web" > tempfile
printf "@d at_sign = @`%s'\n", at_sign > tempfile
print " " > tempfile
print "@ Here is |the_at_sign| left for common" > tempfile
print "@=char the_at_sign = at_sign;" > tempfile
print " " > tempfile

# We provide a default at sign:
#=
at_sign="@@"

#*1Comments.
We have to explain how our programming language supports comments.
We give the strings that initiate and terminate a comment.
We can say comments are terminated by ``newline'' if that's the case.
#=
#=/^comment /#> {
    print $0 > logfile
    for (i=2; i<NF; ) {
        if ($i=="begin") {
            i++
            if ($i ~ #=/^<.*>$/#>) {
                transstring = $i
                wherestring = "in \"comment begin\" on line " NR-1
                #
                begin_comment_string = outstring
                i++
            } else {
                print "Error: \"comment begin\" must have a restricted translation"
                #
            }
        } else if ($i=="end") {
            i++
            if ($i=="newline") {
                comments_end_with_newline = 1
                end_comment_string = "\"\\n\""
            } else if ($i ~ #=/^<.*>$/#>){
                comments_end_with_newline = 0
                transstring = $i
                wherestring = "in \"comment end\" on line " NR-1
                #
                end_comment_string = outstring
            } else {
                print "Error: \"comment end\" must have a restricted translation"
                #
            }
            i++
        } else {
            print "Error: bad comment attribute:", $i
            #
        }
    }
    #
    #
    next
}
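
# For example (these two lines are only illustrative; the exact syntax
of restricted translations is described in the ``\.{SPIDER} User's
Guide''), a C-like language might say

    comment begin <"/*"> end <"*/">

while a language whose comments run to the end of the line would say

    comment begin <"--"> end newline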
#= print "@ Here we recognize the comment start seqence" > ttokfile print "@=" > ttokfile printf "{int len; len=strlen(%s);\n", begin_comment_string > ttokfile printf "if (loc+len<=limit && !strncmp(loc,%s,len)) {\n",\ begin_comment_string > ttokfile print "\tloc += len; /* a new thing */" > ttokfile print "\tskip_comment(); /* scan to end of comment or newline */" > ttokfile print "\tif (comment_continues || comments_end_with_newline)" > ttokfile print "\t\treturn('\\n');" > ttokfile print "\telse continue;\n}\n}" > ttokfile # Now this is \.{WEAVE} finding the start of a comment #= print "@ @=" \ > scrapfile printf "{int len; len=strlen(%s);\n", begin_comment_string > scrapfile printf "if (loc+len-1<=limit && !strncmp(loc-1,%s,len)) {\n",\ begin_comment_string > scrapfile print "\tloc += len-1;" > scrapfile print "\t return (begin_comment); /* scan to end of comment or newline */" > scrapfile print "}\n}" > scrapfile # Here \.{TANGLE} spots the end of a comment #= print "@ Here we deal with recognizing the end of comments" > ttokfile printf "@d comments_end_with_newline = %d\n", comments_end_with_newline >ttokfile print "@=" > ttokfile if (comments_end_with_newline != 1) { printf "{int len; len=strlen(%s);\n", end_comment_string > ttokfile printf "if (loc+len-1<=limit && !strncmp(loc-1,%s,len)) {\n",\ end_comment_string > ttokfile print "loc += len-1; return(comment_continues=0); }}" > ttokfile } else { print "/* This code will never be executed */ " > ttokfile } # Now here is \.{WEAVE}. \.{WEAVE} copes elsewhere with the situation when |comments_end_with_newline| holds, so we don't need to consider it here. #= printf "@ Here we recognize end of comments" > scrapfile printf "@d comments_end_with_newline = %d\n",comments_end_with_newline >scrapfile print "@=" > scrapfile printf "{int len; len=strlen(%s);\n", end_comment_string > scrapfile printf "if (loc+len-1<=limit && !strncmp(loc-1,%s,len)) {\n",\ end_comment_string > scrapfile print " loc++; if(bal==1) {if (phase==2) app_tok('}'); return(0);}" > scrapfile print " else {" > scrapfile print " err_print(\"! Braces don't balance in comment\");" > scrapfile print "@.Braces don't balance in comment@>" > scrapfile print " @;" > scrapfile print " }" > scrapfile print "}" > scrapfile print "}" > scrapfile # We have to give \.{TANGLE} the beginning and ending comment strings, so it can use thing in writing its own comments. #= print "@ Important tokens:" > ttokfile printf "@d begin_comment_string = %s\n", begin_comment_string > ttokfile printf "@d end_comment_string = %s\n", end_comment_string > ttokfile # We also have to write out the starting and ending comment strings to the macro file. We do this at the time of parsing |#=/^comment /#>|, so the user has a chance to override. 
#= if (macrofile!="") { this_string=substr(begin_comment_string,2,length(begin_comment_string)-2) # printf "\\def\\commentbegin{%s}\n", tex_string > macrofile if (comments_end_with_newline==0) { this_string=substr(end_comment_string,2,length(end_comment_string)-2) # printf "\\def\\commentend{%s}\n", tex_string > macrofile } else { print "\\def\\commentend{\\relax}" > macrofile } } else { print "Error: I can't write comment info to the macro file---" print " you haven't given me any \"language\" information" # } # Escaping \TeX's specials is pretty easy: #= texof["\\"]="\\BS" texof["{"]="\\{" texof["}"]="\\{" texof["$"]="\\$" texof["&"]="\\amp" texof["##"]="\\##" texof["^"]="\\H" texof["_"]="\\_" texof["~"]="\\TI" texof["%"]="\\%" # #= tex_string="" while (length(this_string)>0) { c = substr(this_string,1,1) this_string = substr(this_string,2) cprime = texof[c] if (cprime=="") { tex_string = tex_string c } else { tex_string = tex_string cprime } } #*1Controlling line numbering. Here we fart around with line numbering for \.{TANGLE}. This lets \.{TANGLE} write an indication of the locations of things in the \.{WEB} source. The C preprocessor accepts these things as \.{\##line} directives. #= #=/^line /#> { print $0 > logfile for (i=2; i$/#>) { transstring = $i wherestring = "in \"line begin\" on line " NR-1 # sharp_line_open = outstring i++ } else { print "Error: \"line begin\" must have a restricted translation" # } } else if ($i=="end") { i++ if ($i ~ #=/^<.*>$/#>){ transstring = $i wherestring = "in \"line end\" on line " NR-1 # sharp_line_close = outstring } else { print "Error: \"line end\" must have a restricted translation" # } i++ } else { print "Error: bad line attribute:", $i, "on line", NR-1 # } } ## |for| # next } # We have to give \.{TANGLE} the strings for \&{\##line} commands. #= print "@ Important tokens:" > ttokfile printf "@d sharp_line_open = %s\n", sharp_line_open > ttokfile printf "@d sharp_line_close = %s\n", sharp_line_close > ttokfile # We'll choose some innocuous defaults #= sharp_line_open = "\"##line\"" sharp_line_close = "\"\"" #*1Tracking the generation date. We want to be able to note the date on which we generate files. #= #=/^date /#> { ## date returned as ``Fri Dec 11 11:31:18 EST 1987'' mo = month[$3] day = $4 year = $7 time = $5 # date = sprintf ("(generated at %d:%s %s on %s %d, %d)",\ hour, minute, ampm, mo, day, year) next } # We want the months to have their full names #= month["Jan"]="January" month["Feb"]="February" month["Mar"]="March" month["Apr"]="April" month["May"]="May" month["Jun"]="June" month["Jul"]="July" month["Aug"]="August" month["Sep"]="September" month["Oct"]="October" month["Nov"]="November" month["Dec"]="December" # We make a ``friendly'' time from |time=="hh:mm:ss"|. #= hour = substr(time,1,2) if (hour >=12) ampm = "PM" else ampm="AM" if (hour==0) { hour =12 } else if (hour>12) { hour = hour -12 } minute = substr(time,4,2) #*=The {\tt SPIDER} tools. #i cycle.web #*Flagging duplicate names. Detects duplicate names in a sorted list. #(nodups.awk#>= { if ($0==last) { print "Error: duplicate name", $0, "on lines", NR-1"-"NR exit -1 } last = $0 } #*Checking translation keywords for validity. 
#*=Index.
This is a combined index to {\tt SPIDER} and the {\tt SPIDER} tools.
Since the {\tt SPIDER} tools are nearly trivial, it's really just
{\tt SPIDER}.