From 695afe17db5723fef57f6ff4a0009e5bd64ea1b6 Mon Sep 17 00:00:00 2001 From: Aaron Fischer Date: Fri, 13 Jul 2018 23:31:00 +0200 Subject: [PATCH] Fix the html cover scraping (lovelybooks changed the html again) --- src/buchdesmonats/core.clj | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/src/buchdesmonats/core.clj b/src/buchdesmonats/core.clj index b1c552a..a713a54 100644 --- a/src/buchdesmonats/core.clj +++ b/src/buchdesmonats/core.clj @@ -27,13 +27,16 @@ [me.raynes.fs :as fs])) (defn imgurl->bytes [lovelybooks-url] - (-> (java.net.URL. lovelybooks-url) - html/html-resource - (html/select [:div.bookcoverXXL :> :div :> :img]) - first - (get-in [:attrs :src]) - (http-client/get {:as :byte-array}) - :body)) + (let [urls (-> (java.net.URL. lovelybooks-url) + html/html-resource + (html/select [:div.cover-wrapper :> :div.cover :> :img.BookCover]) + first + (get-in [:attrs :srcset]) + (str/split #" ")) + url-to-fetch (last (filter #(re-matches #"http.+\.jpg" %) urls)) + stream (http-client/get url-to-fetch {:as :byte-array})] + (:body stream))) + (defn encode-url-part [part] (java.net.URLEncoder/encode part "UTF-8")) @@ -42,7 +45,7 @@ (let [parts (map encode-url-part (re-find #"\/autor\/([^/]+)\/([^/]+)\/?" url)) author (nth parts 1) book-title (nth parts 2)] - (str "https://lovelybooks.de/autor/" author "/" book-title))) + (str "https://lovelybooks.de/autor/" author "/" book-title))) (defn url->file [lovelybooks-url target-dir] (let [[_ author title] (re-find #".\/autor\/([^\/]+)\/(.+)-([0-9]+)-.\/$" lovelybooks-url)] @@ -87,7 +90,7 @@ (let [content (apply str (index-template book-urls))] (with-open [out (io/writer (io/file "public" "index.html"))] (.write out content)))) - + (defn -main [& args] (let [datasource-url "https://git.okoyono.de/mezzomix/buch_des_monats/raw/master/README.mkd" target-dir (io/file "public" "book-covers")]