-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.ml
139 lines (129 loc) · 4.03 KB
/
main.ml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
open Lwt
open Cohttp_lwt_unix
open Soup
(*
TODO
- path relative to subdirs (misses gardening pages on vaughan.kitchen)
*)
(*
let sites: string list =
[ "http://vaughan.kitchen"
; "http://ambersong.me"
]
*)
let sites: string list =
[ "https://thegentlechef.com"
; "https://spicysouthernkitchen.com"
; "https://lazycatkitchen.com"
; "https://www.asaucykitchen.com"
; "https://www.vegrecipesofindia.com"
; "https://thecaspianchef.com"
; "https://persianmama.com"
; "https://www.unicornsinthekitchen.com"
; "https://thestonesoup.com"
; "https://www.afamilyfeast.com"
; "https://hilahcooking.com"
; "https://www.gimmesomeoven.com"
; "https://www.chilipeppermadness.com" (* key too small *)
]
let cons_uniq xs x =
if List.mem x xs then
xs
else
x :: xs
let dedupe xs =
List.rev (List.fold_left cons_uniq [] xs)
let links (page : string) : string list =
let soup = parse page in
soup $$ "a[href]"
|> to_list
|> List.map (fun a -> a |> R.attribute "href")
let mkdir_p (path : string) : bool =
if Sys.file_exists path && Sys.is_directory path then
true
else if Sys.file_exists path then begin
print_endline ("Failed to create path '" ^ path ^ "' already exists");
false
end else
let chunks = Str.split (Str.regexp "/+") path in
match chunks with
| [] -> false
| x :: tl ->
if tl = [] then
if not (Sys.file_exists x) then begin
Unix.mkdir x 0o777;
true
end else
false (* ?? *)
else
List.fold_left (fun x xs ->
let res = x ^ "/" ^ xs in
(if not (Sys.file_exists res) then
Unix.mkdir res 0o777
else
());
res
) x tl <> "" (* bad hack *)
let write_file (site : string) (body : string): unit =
let site_uri = Uri.of_string site in
let host = match Uri.host site_uri with
| None -> ""
| Some x -> x
in
let path = Uri.path site_uri in
if path = "" || host = "" then
()
else
let chunks = Str.split (Str.regexp "/+") path in
match List.rev chunks with
| [] -> ()
| x :: tl ->
let tl = List.rev tl in
let dir = String.concat "/" (host :: tl) in
if mkdir_p dir then
let filename = dir ^ "/" ^ x ^ ".ccml" in
if Sys.file_exists filename then
print_endline ("Failed to write '" ^ filename ^ "' file already exists")
else
let fh = open_out filename in
Printf.fprintf fh "%s\n" body;
close_out fh
else
() (* should be error? *)
let fetch (domain : string) (site : string) : string list t =
let site_uri = Uri.of_string site in
try%lwt Client.get site_uri >>= fun (_, body) ->
body |> Cohttp_lwt.Body.to_string >>= fun b ->
write_file site b; (* write out html *)
links b
|> List.filter (fun l -> String.index_opt l '#' <> Some 0) (* remove fragment URIs *)
|> List.filter (fun l -> match Uri.host (Uri.of_string l) with
| None -> true
| Some d -> d = domain
) (* remove external URIs *)
|> List.map (fun l -> Uri.with_path site_uri (Uri.path (Uri.of_string l))) (* TODO fix bug with query fragments *)
|> List.map Uri.canonicalize
|> List.map Uri.to_string
|> Lwt.return
with _ ->
print_endline ("Failed fetching: " ^ site);
Lwt.return []
let rec scrape (domain : string) (fetched : string list) (queue : string list) : unit t =
match queue with
| [] -> Lwt.return ()
| x :: tl ->
Unix.sleep 1;
print_endline x;
(*
List.iter (fun l -> print_endline (" F " ^ l)) fetched;
List.iter (fun l -> print_endline (" Q " ^ l)) tl;
*)
let%lwt q = fetch domain x in
let q = List.filter (fun l -> l <> x && not (List.exists (fun l_ -> l = l_) fetched)) q in
scrape domain (x :: fetched) (dedupe (tl @ q))
let () =
Lwt_main.run (Lwt.join (List.map (fun s ->
match Uri.host (Uri.of_string s) with
| None -> Lwt.return ()
| Some d -> scrape d [] (s :: [])) sites
))