forked from YannickChevalier/hevea-mathjax
-
Notifications
You must be signed in to change notification settings - Fork 0
/
htmlparse.ml
129 lines (113 loc) · 4.01 KB
/
htmlparse.ml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
(***********************************************************************)
(* *)
(* HEVEA *)
(* *)
(* Luc Maranget, projet Moscova, INRIA Rocquencourt *)
(* *)
(* Copyright 2001 Institut National de Recherche en Informatique et *)
(* Automatique. Distributed only by permission. *)
(* *)
(* $Id: htmlparse.ml,v 1.11 2008-01-22 18:08:37 maranget Exp $ *)
(***********************************************************************)
open Lexeme
open Tree
exception Error of string
module Make(C:DoOut.Config) = struct
let error msg _lb = raise (Error msg)
;;
module Out = DoOut.Make(C)
module Lex = Htmllex.Make(C)
let buff = ref None
let next_token lexbuf = match !buff with
| Some tok -> buff := None ; tok
| None -> Lex.next_token lexbuf
and put_back lexbuf tok = match !buff with
| None -> buff := Some tok
| _ -> error "Put back" lexbuf
let txt_buff = Out.create_buff ()
let rec to_close tag lb = match next_token lb with
| Close (t,_) as tok when t=tag -> tok
| Open (t,_,txt) when t=tag ->
Out.put txt_buff txt ;
Out.put txt_buff (Htmllex.to_string (to_close tag lb)) ;
to_close tag lb
| Eof -> error ("Eof in to_close") lb
| tok ->
Out.put txt_buff (Htmllex.to_string tok);
to_close tag lb
let rec tree cls lexbuf =
match next_token lexbuf with
| (Eof|Close (_,_)) as tok-> put_back lexbuf tok ; None
| Open (STYLE,_,txt) ->
let otxt = txt
and ctxt = Htmllex.to_string (to_close STYLE lexbuf) in
let txt = Out.to_string txt_buff in
let txt = match cls with
| None -> txt
| Some cls ->
let css = Lex.styles (MyLexing.from_string txt) in
let buff = Out.create_buff () in
Out.put_char buff '\n' ;
List.iter
(fun cl -> match cl with
| Css.Other txt ->
Out.put buff txt ;
Out.put_char buff '\n'
| Css.Class (name, addname, txt) ->
if Emisc.Strings.mem name cls then begin
Out.put_char buff '.' ;
Out.put buff name ;
begin match addname with
| None -> ()
| Some n ->
Out.put_char buff ' ' ;
Out.put buff n
end ;
Out.put buff txt ;
Out.put_char buff '\n'
end)
css ;
Out.to_string buff in
Some (Text (otxt^txt^ctxt))
| Open (SCRIPT,_,txt) ->
Out.put txt_buff txt ;
Out.put txt_buff (Htmllex.to_string (to_close SCRIPT lexbuf)) ;
Some (Text (Out.to_string txt_buff))
| Open (tag,attrs,txt) ->
let fils = trees cls lexbuf in
begin match next_token lexbuf with
| Close (ctag,ctxt) when tag=ctag ->
Some
(match tag with
| A|SUP|SUB ->
ONode (txt,ctxt,fils)
| _ ->
Node
({tag=tag ; attrs=attrs ; txt=txt ; ctxt=ctxt},fils))
| tok ->
error (Htmllex.to_string tok ^ " closes "^txt) lexbuf
end
| Lexeme.Text txt -> Some (Text txt)
| Lexeme.Blanks txt -> Some (Blanks txt)
and trees cls lexbuf = match tree cls lexbuf with
| None -> []
| Some t -> t::trees cls lexbuf
let rec do_main cls lexbuf = match tree cls lexbuf with
| None ->
begin match next_token lexbuf with
| Eof -> []
| tok -> error ("Unexpected " ^ Htmllex.to_string tok) lexbuf
end
| Some (Text _ as last) -> [last]
| Some t -> t :: do_main cls lexbuf
let ptop () = Lex.ptop ()
let reset () =
Lex.reset() ;
Out.reset txt_buff
let main cls lexbuf =
try
do_main cls lexbuf
with
| e -> reset () ; raise e
let classes = Lex.classes
end