Fix the HTML cover scraping (lovelybooks changed the HTML again)

This commit is contained in:
Aaron Fischer 2018-07-13 23:31:00 +02:00
parent 641fcb8712
commit 695afe17db

View file

@@ -27,13 +27,16 @@
[me.raynes.fs :as fs])) [me.raynes.fs :as fs]))
(defn imgurl->bytes [lovelybooks-url] (defn imgurl->bytes [lovelybooks-url]
(-> (java.net.URL. lovelybooks-url) (let [urls (-> (java.net.URL. lovelybooks-url)
html/html-resource html/html-resource
(html/select [:div.bookcoverXXL :> :div :> :img]) (html/select [:div.cover-wrapper :> :div.cover :> :img.BookCover])
first first
(get-in [:attrs :src]) (get-in [:attrs :srcset])
(http-client/get {:as :byte-array}) (str/split #" "))
:body)) url-to-fetch (last (filter #(re-matches #"http.+\.jpg" %) urls))
stream (http-client/get url-to-fetch {:as :byte-array})]
(:body stream)))
(defn encode-url-part [part] (defn encode-url-part [part]
(java.net.URLEncoder/encode part "UTF-8")) (java.net.URLEncoder/encode part "UTF-8"))
@@ -42,7 +45,7 @@
(let [parts (map encode-url-part (re-find #"\/autor\/([^/]+)\/([^/]+)\/?" url)) (let [parts (map encode-url-part (re-find #"\/autor\/([^/]+)\/([^/]+)\/?" url))
author (nth parts 1) author (nth parts 1)
book-title (nth parts 2)] book-title (nth parts 2)]
(str "https://lovelybooks.de/autor/" author "/" book-title))) (str "https://lovelybooks.de/autor/" author "/" book-title)))
(defn url->file [lovelybooks-url target-dir] (defn url->file [lovelybooks-url target-dir]
(let [[_ author title] (re-find #".\/autor\/([^\/]+)\/(.+)-([0-9]+)-.\/$" lovelybooks-url)] (let [[_ author title] (re-find #".\/autor\/([^\/]+)\/(.+)-([0-9]+)-.\/$" lovelybooks-url)]
@@ -87,7 +90,7 @@
(let [content (apply str (index-template book-urls))] (let [content (apply str (index-template book-urls))]
(with-open [out (io/writer (io/file "public" "index.html"))] (with-open [out (io/writer (io/file "public" "index.html"))]
(.write out content)))) (.write out content))))
(defn -main [& args] (defn -main [& args]
(let [datasource-url "https://git.okoyono.de/mezzomix/buch_des_monats/raw/master/README.mkd" (let [datasource-url "https://git.okoyono.de/mezzomix/buch_des_monats/raw/master/README.mkd"
target-dir (io/file "public" "book-covers")] target-dir (io/file "public" "book-covers")]