|
| 1 | +/** |
| 2 | + * 批量下載 Apple Podcasts 節目的工具。 Download Apple Podcasts episodes. |
| 3 | + */ |
| 4 | + |
| 5 | +'use strict'; |
| 6 | + |
| 7 | +require('../work_crawler_loader.js'); |
| 8 | + |
| 9 | +// ---------------------------------------------------------------------------- |
| 10 | + |
| 11 | +CeL.run([ 'application.storage.EPUB' |
| 12 | +// CeL.detect_HTML_language() |
| 13 | +, 'application.locale' ]); |
| 14 | + |
| 15 | +// ---------------------------------------------------------------------------- |
| 16 | + |
/**
 * Crawler configuration for Apple Podcasts (podcasts.apple.com, "tw" paths).
 *
 * Every episode is treated as one "chapter": its media file is downloaded into
 * a "media" sub-directory of the work directory, and its description text is
 * added to the generated e-book.
 *
 * The option names below are hooks consumed by CeL.work_crawler, which is
 * defined outside this file.
 */
var crawler = new CeL.work_crawler({
	// Automatically create an e-book.
	// MUST include CeL.application.locale!
	need_create_ebook : true,
	// Optional knobs, left at the framework defaults:
	// recheck : 'changed',
	// search_work_interval : '2s',
	// chapter_time_interval : '2s',

	site_name : 'Podcast',

	base_URL : 'https://podcasts.apple.com/',

	/**
	 * Build the search URL used to resolve a work title into a work id.
	 *
	 * @param {String}key
	 *            search keyword (the work title).
	 * @returns {String} absolute URL of the apple.com search page.
	 */
	search_URL : function(key) {
		// The keyword is placed in the URL path, so it must be escaped: a raw
		// "/", "?", "#" or "%" would otherwise change the request target.
		return 'https://www.apple.com/tw/search/' + encodeURIComponent(key)
				+ '?src=globalnav';
	},

	/**
	 * Extract podcast candidates from the search result page.
	 *
	 * @param {String}html
	 *            content of the search result page.
	 * @param {Function}get_label
	 *            helper converting an HTML fragment to a plain label.
	 * @returns {Array} [ id_list, title_list ]; both lists share the same
	 *          order. Each id has the form "<decoded URL slug>-<numeric id>".
	 */
	parse_search_result : function(html, get_label) {
		// Only the part of the page after the id="explore" marker is scanned.
		// .between() / .each_between() are CeL string helpers.
		html = html.between(' id="explore"');

		var id_list = [], title_list = [];

		html.each_between('<div class="rf-serp-product-description">', null,
		//
		function(text) {
			// Entries that do not link to a podcast page are skipped.
			var matched = text.match(/\/podcast\/([^\/"]+)\/id(\d+)"/);
			if (!matched)
				return;

			id_list.push(decodeURIComponent(matched[1]) + '-' + matched[2]);
			title_list.push(get_label(text.between(
					'<h2 class="rf-serp-productname">', '</h2>')));
		});

		return [ id_list, title_list ];
	},

	/**
	 * Build the (base_URL relative) URL of a podcast page.
	 *
	 * @param {String}work_id
	 *            id in the "<slug>-<numeric id>" form produced by
	 *            parse_search_result().
	 * @returns {String} path of the podcast page.
	 * @throws {Error} when work_id does not have the expected form.
	 */
	work_URL : function(work_id) {
		var matched = String(work_id).match(/^(.+)-(\d+)$/);
		if (!matched) {
			throw new Error('work_URL: Invalid work id '
					+ JSON.stringify(work_id)
					+ ', expected "<slug>-<numeric id>".');
		}
		return 'tw/podcast/' + matched[1] + '/id' + matched[2];
	},

	/**
	 * Extract the work (podcast show) information from the podcast page.
	 *
	 * @param {String}html
	 *            content of the podcast page.
	 * @param {Function}get_label
	 *            helper converting an HTML fragment to a plain label.
	 * @param {Function}extract_work_data
	 *            framework helper; not used here.
	 * @returns {Object} work data: the title, plus every property of the
	 *          page's "schema:podcast-show" JSON-LD block when it is present.
	 */
	parse_work_data : function(html, get_label, extract_work_data) {
		var work_data = {
			// Required property.
			title : get_label(html.between(
					'<span class="product-header__title"', '</span>').between(
					'>'))
		};

		// The JSON-LD block only supplies additional metadata, so a missing
		// or malformed block is reported but does not abort the work.
		var schema_text = html.between(
		//
		'<script name="schema:podcast-show" type="application/ld+json">',
				'</script>');
		if (schema_text) {
			try {
				Object.assign(work_data, JSON.parse(schema_text));
			} catch (e) {
				CeL.error('parse_work_data: Cannot parse the '
						+ 'schema:podcast-show data: ' + e);
			}
		}

		// Strip invisible formatting characters (zero-width characters and
		// bidirectional marks). They are written as escapes so that the
		// pattern stays visible in every editor.
		// e.g.,
		// https://podcasts.apple.com/tw/podcast/%E4%B8%8B%E4%B8%80%E6%9C%AC%E8%AE%80%E4%BB%80%E9%BA%BC/id1532820533
		work_data.title = work_data.title.replace(
				/[\u200B-\u200F\u202A-\u202E\u2060\uFEFF]/g, '');

		return work_data;
	},

	/**
	 * Fill work_data.chapter_list with the episodes embedded in the podcast
	 * page.
	 *
	 * @param {Object}work_data
	 *            work data; its chapter_list property is replaced.
	 * @param {String}html
	 *            content of the podcast page.
	 * @param {Function}get_label
	 *            helper converting an HTML fragment to a plain label; not used
	 *            here.
	 * @throws {Error} when the embedded episode data cannot be found.
	 */
	get_chapter_list : function(work_data, html, get_label) {
		var cache_text = html.between(
				' id="shoebox-media-api-cache-amp-podcasts">', '</script>');
		if (!cache_text) {
			throw new Error('get_chapter_list: Cannot find the '
					+ 'shoebox-media-api-cache-amp-podcasts data. '
					+ 'The page layout may have changed.');
		}

		// The script holds a JSON object whose first value is itself a JSON
		// string; the episode list sits under .d[0].relationships.episodes.
		var cache = JSON.parse(cache_text);
		var response = JSON.parse(cache[Object.keys(cache)[0]]);
		var episode_list = response.d[0].relationships.episodes.data;

		episode_list.forEach(function(chapter_data) {
			chapter_data.title = chapter_data.attributes.name;
			chapter_data.url = chapter_data.attributes.url;
		});

		// reset work_data.chapter_list
		work_data.chapter_list = episode_list;
	},

	/**
	 * Download the media file of a chapter (episode). Runs (async) before the
	 * chapter data is parsed.
	 *
	 * This hook must always end up calling callback() and must not throw, so
	 * every failure is logged and the chapter simply goes without its media
	 * file.
	 *
	 * @param {Object}XMLHttp
	 *            response of the chapter page; not used here.
	 * @param {Object}work_data
	 *            work data holding chapter_list and directory.
	 * @param {Function}callback
	 *            called without arguments once the download has finished or
	 *            failed.
	 * @param {Number}chapter_NO
	 *            1-based chapter number.
	 */
	pre_parse_chapter_data : function(XMLHttp, work_data, callback,
			chapter_NO) {
		var url, file_name;

		try {
			var chapter_data = work_data.chapter_list[chapter_NO - 1];
			var asset_URL = chapter_data.attributes.assetUrl;
			if (!asset_URL)
				throw new Error('No assetUrl specified.');

			url = decodeURI(asset_URL);

			// The file extension is the last ".xxx" of the URL, ignoring any
			// query string or fragment. "/" is excluded so that a URL without
			// extension does not yield something like ".com/path/file".
			var extension = url.match(/(\.[^.?#\/]+)(?:[?#].*)?$/);
			extension = extension ? extension[1] : '';

			var directory = work_data.directory + 'media'
					+ CeL.env.path_separator;
			file_name = directory + CeL.to_file_name(chapter_data.title)
					+ extension;
			CeL.create_directory(directory);

			// Some asset URLs embed the real, percent-encoded media URL;
			// when one is found, fetch it directly.
			var matched = url.match(/https%3A%2F%2F[^?]+/);
			if (matched)
				url = decodeURIComponent(matched[0]);
		} catch (e) {
			CeL.error('pre_parse_chapter_data: Skip the media file of chapter '
					+ chapter_NO + ': ' + e);
			callback();
			return;
		}

		CeL.log_temporary('Fetching [' + file_name + '] (' + url + ')...');
		CeL.get_URL_cache(url, function(data, error) {
			if (error) {
				CeL.error('pre_parse_chapter_data: Failed to fetch [' + url
						+ ']: ' + error);
			}
			callback();
		}, {
			file_name : file_name,
			// No text encoding is specified for the media file.
			encoding : undefined,
			get_URL_options : Object.assign({
				error_retry : this.MAX_ERROR_RETRY
			}, this.get_URL_options, {
				// Some files are large and take longer to download.
				timeout : 5 * 60 * 1000
			})
		});
	},

	/**
	 * Add the episode description to the e-book as one chapter.
	 *
	 * @param {String}html
	 *            content of the chapter (episode) page.
	 * @param {Object}work_data
	 *            work data holding chapter_list.
	 * @param {Function}get_label
	 *            helper converting an HTML fragment to a plain label; not used
	 *            here.
	 * @param {Number}chapter_NO
	 *            1-based chapter number.
	 */
	parse_chapter_data : function(html, work_data, get_label, chapter_NO) {
		var chapter_data = work_data.chapter_list[chapter_NO - 1];
		var attributes = chapter_data.attributes;
		var description = attributes.description;

		this.add_ebook_chapter(work_data, chapter_NO, {
			title : chapter_data.title,
			// NOTE(review): assumes the episode attributes carry
			// releaseDateTime -- confirm against the embedded data. The
			// byline scrape is kept as fallback for when it is absent.
			date : attributes.releaseDateTime
					|| html.between('<p class="post-byline">', '<').trim()
							.replace(/^\d*$/, ''),
			// An episode without description yields an empty chapter text.
			text : description && description.standard || ''
		});
	}
});

// ----------------------------------------------------------------------------

// CeL.set_debug(3);

start_crawler(crawler, typeof module === 'object' && module);
0 commit comments