1
Fork 0
mirror of https://git.savannah.gnu.org/git/guile.git synced 2025-05-20 11:40:18 +02:00

A first, rough lexer for elisp still missing some stuff.

* module/language/elisp/lexer.scm: New lexer file.
* test-suite/Makefile.am: Register elisp-reader.test as new test.
* test-suite/tests/elisp-reader.test: New test-case.
This commit is contained in:
Daniel Kraft 2009-08-26 14:32:48 +02:00
parent 157ffbd797
commit 25512a940b
3 changed files with 250 additions and 0 deletions

View file

@ -0,0 +1,162 @@
;;; Guile Emac Lisp
;; Copyright (C) 2001 Free Software Foundation, Inc.
;; This program is free software; you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation; either version 2, or (at your option)
;; any later version.
;;
;; This program is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;; GNU General Public License for more details.
;;
;; You should have received a copy of the GNU General Public License
;; along with this program; see the file COPYING. If not, write to
;; the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
;; Boston, MA 02111-1307, USA.
;;; Code:
(define-module (language elisp lexer)
#:use-module (ice-9 regex)
#:export (get-lexer))
; This is the lexical analyzer for the elisp reader. It is hand-written
; instead of using some generator because I think that's most viable in this
; case and easy enough.
; Read a symbol or number from a port until something follows that marks the
; start of a new token (like whitespace or parentheses). The data read is
; returned as a string for further conversion to the correct type, but we also
; return what this is (integer/float/symbol).
; If any escaped character is found, it must be a symbol. Otherwise we
; at the end check the result-string against regular expressions to determine
; if it is possibly an integer or a float.
(define integer-regex (make-regexp "^[+-]?[0-9]+\\.?$"))
(define float-regex
(make-regexp "^[+-]?([0-9]+\\.?[0-9]*|[0-9]*\\.?[0-9]+)(e[+-]?[0-9]+)?$"))
; A dot is also allowed literally, only a single dort alone is parsed as the
; 'dot' terminal for dotted lists.
(define no-escape-punctuation (string->char-set "-+=*/_~!@$%^&:<>{}?."))
(define (get-symbol-or-number port)
(let iterate ((result-chars '())
(had-escape #f))
(let* ((c (read-char port))
(finish (lambda ()
(let ((result (list->string (reverse result-chars))))
(values
(cond
((and (not had-escape)
(regexp-exec integer-regex result))
'integer)
((and (not had-escape)
(regexp-exec float-regex result))
'float)
(else 'symbol))
result))))
(need-no-escape? (lambda (c)
(or (char-numeric? c)
(char-alphabetic? c)
(char-set-contains? no-escape-punctuation
c)))))
(cond
((eof-object? c) (finish))
((need-no-escape? c) (iterate (cons c result-chars) had-escape))
((char=? c #\\) (iterate (cons (read-char port) result-chars) #t))
(else
(unread-char c port)
(finish))))))
; Main lexer routine, which is given a port and does look for the next token.
(define (lex port)
(let ((return (let ((file (if (file-port? port) (port-filename port) #f))
(line (1+ (port-line port)))
(column (1+ (port-column port))))
(lambda (token value)
(let ((obj (cons token value)))
(set-source-property! obj 'filename file)
(set-source-property! obj 'line line)
(set-source-property! obj 'column column)
obj))))
; Read afterwards so the source-properties are correct above
; and actually point to the very character to be read.
(c (read-char port)))
(cond
; End of input must be specially marked to the parser.
((eof-object? c) '*eoi*)
; Whitespace, just skip it.
((char-whitespace? c) (lex port))
; The dot is only the one for dotted lists if followed by
; whitespace. Otherwise it is considered part of a number of symbol.
((and (char=? c #\.)
(char-whitespace? (peek-char port)))
(return 'dot #f))
; Continue checking for literal character values.
(else
(case c
; A line comment, skip until end-of-line is found.
((#\;)
(let iterate ()
(let ((cur (read-char port)))
(if (or (eof-object? cur) (char=? cur #\newline))
(lex port)
(iterate)))))
; Parentheses and other special-meaning single characters.
((#\() (return 'paren-open #f))
((#\)) (return 'paren-close #f))
((#\[) (return 'square-open #f))
((#\]) (return 'square-close #f))
((#\') (return 'quote #f))
((#\`) (return 'backquote #f))
((#\,) (return 'unquote #f))
; Remaining are numbers and symbols. Process input until next
; whitespace is found, and see if it looks like a number
; (float/integer) or symbol and return accordingly.
(else
(unread-char c port)
(call-with-values
(lambda ()
(get-symbol-or-number port))
(lambda (type str)
(case type
((symbol) (return 'symbol (string->symbol str)))
((integer)
; In elisp, something like "1." is an integer, while
; string->number returns an inexact real. Thus we
; need a conversion here, but it should always result in
; an integer!
(return 'integer
(let ((num (inexact->exact (string->number str))))
(if (not (integer? num))
(error "Expected integer" str num))
num)))
((float)
(return 'float (let ((num (string->number str)))
(if (exact? num)
(error "Expected inexact float" str num))
num)))
(else (error "Wrong number/symbol type" type)))))))))))
; Build a lexer thunk for a port. This is the exported routine which can be
; used to create a lexer for the parser to use.
(define (get-lexer port)
(lambda ()
(lex port)))

View file

@ -33,6 +33,7 @@ SCM_TESTS = tests/alist.test \
tests/continuations.test \
tests/elisp.test \
tests/elisp-compiler.text \
tests/elisp-reader.text \
tests/environments.test \
tests/eval.test \
tests/exceptions.test \

View file

@ -0,0 +1,87 @@
;;;; elisp-reader.test --- Test the reader used by the Elisp compiler.
;;;;
;;;; Copyright (C) 2009 Free Software Foundation, Inc.
;;;; Daniel Kraft
;;;;
;;;; This library is free software; you can redistribute it and/or
;;;; modify it under the terms of the GNU Lesser General Public
;;;; License as published by the Free Software Foundation; either
;;;; version 3 of the License, or (at your option) any later version.
;;;;
;;;; This library is distributed in the hope that it will be useful,
;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;;;; Lesser General Public License for more details.
;;;;
;;;; You should have received a copy of the GNU Lesser General Public
;;;; License along with this library; if not, write to the Free Software
;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
(define-module (test-elisp-reader)
:use-module (test-suite lib)
:use-module (language elisp lexer))
; ==============================================================================
; Test the lexer.
; This is of course somewhat redundant with the full parser checks, but probably
; can't hurt and is useful in developing the lexer itself.
(define (get-string-lexer str)
(call-with-input-string str get-lexer))
(define (lex-string str)
(let ((lexer (get-string-lexer str)))
(let iterate ((result '()))
(let ((token (lexer)))
(if (eq? token '*eoi*)
(reverse result)
(iterate (cons token result)))))))
(with-test-prefix "Lexer"
(let ((lexer (get-string-lexer "")))
(pass-if "end-of-input"
(and (eq? (lexer) '*eoi*)
(eq? (lexer) '*eoi*)
(eq? (lexer) '*eoi*))))
(pass-if "single character tokens"
(equal? (lex-string "()[]'`, . ")
'((paren-open . #f) (paren-close . #f)
(square-open . #f) (square-close . #f)
(quote . #f) (backquote . #f) (unquote . #f) (dot . #f))))
(pass-if "whitespace and comments"
(equal? (lex-string " (\n\t) ; this is a comment\n. ; until eof")
'((paren-open . #f) (paren-close . #f) (dot . #f))))
(pass-if "source properties"
(let ((x (car (lex-string "\n\n \n . \n"))))
(and (= (source-property x 'line) 4)
(= (source-property x 'column) 3))))
(pass-if "symbols"
(equal? (lex-string "foo FOO char-to-string 1+ \\+1
\\(*\\ 1\\ 2\\)
+-*/_~!@$%^&=:<>{}
abc(def)ghi .e5")
`((symbol . foo) (symbol . FOO) (symbol . char-to-string)
(symbol . 1+) (symbol . ,(string->symbol "+1"))
(symbol . ,(string->symbol "(* 1 2)"))
(symbol . +-*/_~!@$%^&=:<>{})
(symbol . abc) (paren-open . #f) (symbol . def)
(paren-close . #f) (symbol . ghi) (symbol . .e5))))
; Here we make use of the property that exact/inexact numbers are not equal?
; even when they have the same numeric value!
(pass-if "integers"
(equal? (lex-string "-1 1 1. +1 01234")
'((integer . -1) (integer . 1) (integer . 1) (integer . 1)
(integer . 1234))))
(pass-if "floats"
(equal? (lex-string "1500.0 15e2 15.e2 1.5e3 .15e4 -.345e-2")
'((float . 1500.0) (float . 1500.0) (float . 1500.0)
(float . 1500.0) (float . 1500.0)
(float . -0.00345)))))