(load (string-append laml-dir "tools/xml-html-support/" "xml-support.scm"))
(define (parse-html-file in-file-path out-file-path)
(let* ((init-in-path (file-name-initial-path in-file-path))
(in-file-name-prop (file-name-proper in-file-path))
(in-ext (file-name-extension in-file-path))
)
(reset-xml-parser)
(let* ((input-port (open-input-file (string-append init-in-path in-file-name-prop "." (if (empty-string? in-ext) "html" in-ext)))))
(set! ip input-port)
(let ((parse-tree (parse-html-ip))
(target-file-name out-file-path))
(set! resulting-parse-tree parse-tree)
(if (file-exists? target-file-name) (delete-file target-file-name))
(let ((op (open-output-file target-file-name)))
(write parse-tree op)
(close-output-port op)))
(display-message (string-append "DONE. The parse tree is in " out-file-path))
(display-message "Use (scheme-pp <file>) to pretty pring the parse tree.")
(display-message "The result is also in the variable resulting-parse-tree for interactive use.")
(close-input-port ip))))
(define (parse-html file-path)
(let ((init-path (file-name-initial-path file-path))
(file-name-prop (file-name-proper file-path))
(ext (file-name-extension file-path))
)
(reset-xml-parser)
(let* ((input-port (open-input-file (string-append init-path file-name-prop "." (if (empty-string? ext) "html" ext)))))
(set! ip input-port)
(let ((parse-tree (parse-html-ip)))
(close-input-port ip)
parse-tree))))
(define (parse-html-ip)
(if (not end-of-file?) (skip-white-space))
(if (not end-of-file?)
(parse-iteratively-html)
'()
)
)
(define (parse-iteratively-html)
(parse-message "Parsing html iteratively.")
(if (not end-of-file?) (skip-white-space))
(cond ((and end-of-file? (not (parse-stack-empty?)) (>= (length parse-stack) 1))
(aggregate-final-parse-tree 'html-tree))
((and (not end-of-file?) (eq? 'tag (what-is-ahead)))
(let* ((tag (read-tag))
(kind (kind-of-tag-structure tag)))
(cond ((eq? kind 'start)
(parse-stack-push tag)
(parse-iteratively-html)
)
((eq? kind 'start-end)
(parse-stack-push tag)
(parse-iteratively-html)
)
((eq? kind 'end)
(let ((tree (build-html-tree-from-stack (tag-of-tag-structure tag))))
(parse-stack-push tree)
(parse-iteratively-html)))
(else (laml-error "parse-iteratively-html: Unknown kind of tag" kind))
)))
((and (not end-of-file?) (eq? 'contents-string (what-is-ahead)))
(let ((contents-string (read-contents-string #f)))
(parse-stack-push contents-string)
(parse-iteratively-html)))
((and (not end-of-file?) (eq? 'comment (what-is-ahead)))
(let ((comment-string (read-comment)))
(parse-stack-push (make-comment-structure comment-string))
(parse-iteratively-html)))
((and (not end-of-file?) (eq? 'declaration (what-is-ahead)))
(let ((declaration-structure (read-declaration)))
(parse-stack-push
(make-declaration-structure
(kind-of-declaration-structure declaration-structure)
(value-of-declaration-structure declaration-structure)))
(parse-iteratively-html)))
((and (not end-of-file?) (eq? 'xml-declaration (what-is-ahead)))
(let ((declaration-structure (read-xml-declaration)))
(parse-stack-push
(make-xml-declaration-structure declaration-structure))
(parse-iteratively-html)))
(else (parse-error "parse-iteratively-html: Parse problem."))))
(define (build-html-tree-from-stack end-tag-name)
(parse-message "building tree: " end-tag-name)
(build-html-tree-from-stack-1 end-tag-name '()))
(define non-end-tags (list "meta" "base" "isindex" "frame" "th" "td" "tr" "col" "colgroup" "tfoot" "thead" "option" "input" "li" "dd" "dt" "p" "hr" "param" "img" "link" "area" "br" "basefont"))
(define (build-html-tree-from-stack-1 end-tag-name tree-list)
(let ((top (parse-stack-top-and-pop)))
(cond ((and (start-tag-entry? top) (matches-stack-entry top end-tag-name))
(make-parse-tree top tree-list))
((and (start-tag-entry? top) (not (member (downcase-string (tag-of-tag-structure top)) non-end-tags)))
(make-parse-tree top tree-list))
((and (start-tag-entry? top))
(build-html-tree-from-stack-1 end-tag-name (cons (make-it-empty top) tree-list)))
(else (build-html-tree-from-stack-1 end-tag-name (cons top tree-list))))))
(define (make-it-empty tag)
(make-tag-structure 'start-end (tag-of-tag-structure tag) (attributes-of-tag-structure tag)))
(define (pretty-print-html-parse-tree-file in-file-path . optional-parameters)
(let ((out-file-path (optional-parameter 1 optional-parameters in-file-path)))
(let ((parse-tree (file-read in-file-path)))
(write-text-file
(pretty-print-html-parse-tree parse-tree)
out-file-path))))
(define (pretty-print-html-parse-tree parse-tree)
(set! res '())
(pretty-print-xml-html-parse-tree-1 parse-tree 0 #f)
(linearize-pp-result (reverse res)))
(define (pretty-print-empty-tag tag-structure start-col single-lining?)
(pretty-print-start-tag tag-structure start-col single-lining?))
(define (white-space-preserving-context? parse-stack)
(find-in-list
(lambda (x)
(and (tag-entry? x) (equal? "pre" (downcase-string (tag-of-tag-structure x)))))
parse-stack))
(define (parse-html-string str)
(let* ((name "html-temp.html")
(temp-file-path (string-append (laml-temp-file-path) name))
(tree #f)
)
(if (file-exists? temp-file-path) (delete-file temp-file-path))
(write-text-file str temp-file-path)
(set! tree (parse-html temp-file-path))
(delete-file temp-file-path)
tree))