From 08f504f30386716c530da41b7c60f58696a3694e Mon Sep 17 00:00:00 2001 From: Aaron Fischer Date: Tue, 10 Nov 2020 20:43:04 +0100 Subject: [PATCH] Generalize codebase and add more datasources. Fixes #9 Relates #8 * Support mojoreads as a new data source * Change image file names to hashsums to be more generic * Clean up the source code by separating related stuff into its own files * Fix the Chaos Monkeys book --- BOOK.mkd | 2 +- public/index.html | 14 +++++ src/buchdesmonats/core.clj | 69 +++++++++-------------- src/buchdesmonats/hash.clj | 11 ++++ src/buchdesmonats/sources/lovelybooks.clj | 21 +++++++ src/buchdesmonats/sources/mojoreads.clj | 8 +++ 6 files changed, 82 insertions(+), 43 deletions(-) create mode 100644 public/index.html create mode 100644 src/buchdesmonats/hash.clj create mode 100644 src/buchdesmonats/sources/lovelybooks.clj create mode 100644 src/buchdesmonats/sources/mojoreads.clj diff --git a/BOOK.mkd b/BOOK.mkd index 46c60be..dba2590 100644 --- a/BOOK.mkd +++ b/BOOK.mkd @@ -24,7 +24,7 @@ * September: [Andreas Eschbach "Das größte Abenteuer"](https://www.lovelybooks.de/autor/Andreas-Eschbach/Perry-Rhodan-Das-gr%C3%B6%C3%9Fte-Abenteuer-1955713268-w/) * Oktober: [Christiane-Frohmann "Präraffaelitische Girls erklären das Internet"](https://www.lovelybooks.de/autor/Christiane-Frohmann/Pr%C3%A4raffaelitische-Girls-erkl%C3%A4ren-das-Internet-1499687993-w/) * November: [Helen Stelthove "Pretty Fly For A Wifi"](https://www.lovelybooks.de/autor/Helen-Stelthove/Pretty-Fly-For-A-Wifi-2032951731-w/) -* Dezember: [Antonio Garcia Martinez "Chaos Monkeys"](https://www.lovelybooks.de/autor/Antonio-Garcia-Martinez/Chaos-Monkeys-1565378792-w/) +* Dezember: [Antonio Garcia Martinez "Chaos Monkeys"](https://mojoreads.de/book/Chaos-MonkeysAntonio-Garcia-Martinez/361610) ## 2018 diff --git a/public/index.html b/public/index.html new file mode 100644 index 0000000..1963cdc --- /dev/null +++ b/public/index.html @@ -0,0 +1,14 @@ + + + + + okoyono.de -- Buch des Monats + + + +

Buch des Monats

+ + Comic des Monats + Buch des Monats + + diff --git a/src/buchdesmonats/core.clj b/src/buchdesmonats/core.clj index bc0eaf8..052d4c7 100644 --- a/src/buchdesmonats/core.clj +++ b/src/buchdesmonats/core.clj @@ -1,16 +1,12 @@ -;;; Copyright (C) 2014-2019 Aaron Fischer -;;; -;;; Permission is hereby granted, free of charge, to any person obtaining a copy of -;;; this software and associated documentation files (the "Software"), to deal in -;;; the Software without restriction, including without limitation the rights to -;;; use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of -;;; the Software, and to permit persons to whom the Software is furnished to do so, -;;; subject to the following conditions: -;;; -;;; The above copyright notice and this permission notice shall be included in all -;;; copies or substantial portions of the Software. -;;; -;;; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +;;; Copyright (C) 2014-2020 Aaron Fischer Permission +;;; is hereby granted, free of charge, to any person obtaining a copy of this +;;; software and associated documentation files (the "Software"), to deal in the +;;; Software without restriction, including without limitation the rights to +;;; use, copy, modify, merge, publish, distribute, sublicense, and/or sell +;;; copies of the Software, and to permit persons to whom the Software is +;;; furnished to do so, subject to the following conditions: The above +;;; copyright notice and this permission notice shall be included in all copies +;;; or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ;;; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS ;;; FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR ;;; COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER @@ -24,35 +20,26 @@ [clojure.string :as str] [clojure.java.io :as io] [clojure.tools.logging :as log] - [me.raynes.fs :as fs])) + [me.raynes.fs :as fs] + [buchdesmonats.hash :as hash] + [buchdesmonats.sources.mojoreads :as mojoreads] + [buchdesmonats.sources.lovelybooks :as lovelybooks])) -;; TODO: Make it compatible to other book cover sites -(defn imgurl->bytes [lovelybooks-url] - (let [urls (-> (java.net.URL. lovelybooks-url) - html/html-resource - (html/select [:img.ResponsiveImage.BookCover]) - first - (get-in [:attrs :srcset]) - (str/split #" ")) - url-to-fetch (last (filter #(re-matches #"http.+\.jpg" %) urls)) +(defn bookurl->imageurl [bookurl] + (let [url (java.net.URL. bookurl)] + (case (.getHost url) + "www.lovelybooks.de" (lovelybooks/find-cover-image bookurl) + "lovelybooks.de" (lovelybooks/find-cover-image bookurl) + "mojoreads.com" (mojoreads/find-cover-image bookurl) + "mojoreads.de" (mojoreads/find-cover-image bookurl)))) + +(defn imgurl->bytes [url] + (let [url-to-fetch (bookurl->imageurl url) stream (http-client/get url-to-fetch {:as :byte-array})] (:body stream))) -(defn encode-url-part [part] - (java.net.URLEncoder/encode part "UTF-8")) - -(defn encode-url [url] - (let [parts (map encode-url-part (re-find #"\/autor\/([^/]+)\/([^/]+)\/?" url)) - author (nth parts 1) - book-title (nth parts 2)] - (str "https://lovelybooks.de/autor/" author "/" book-title))) - -(defn url->file [lovelybooks-url target-dir] - (let [[_ author title] (re-find #".\/autor\/([^\/]+)\/(.+)-([0-9]+)-.\/$" lovelybooks-url)] - (-> (str author "_" title ".jpg") - str/lower-case - (str/replace #"[^a-z0-9-_.]" "") - (#(io/file target-dir %))))) +(defn url->file [url target-dir] + (io/file target-dir (str (hash/md5 url) ".jpg"))) (defn scrape-book-urls [datasource-url] (->> (http-client/get datasource-url {:insecure? 
true}) @@ -64,13 +51,11 @@ (defn scrape-book-cover [url target-dir] (try (let [target-file (url->file url target-dir) - encoded-url (encode-url url) - bytes (imgurl->bytes encoded-url)] + bytes (imgurl->bytes url)] (with-open [out (io/output-stream target-file)] (.write out bytes))) (catch Exception e - ;; TODO: Better exception (examine e?) - (log/info "Problems with " url ", skip it.")))) + (log/info "Problem with " url ":" (get-in e [:via :message]) ". Skip it.")))) (defn find-missing-covers [books-url target-dir] (remove #(fs/exists? (url->file % target-dir)) diff --git a/src/buchdesmonats/hash.clj b/src/buchdesmonats/hash.clj new file mode 100644 index 0000000..f9e2035 --- /dev/null +++ b/src/buchdesmonats/hash.clj @@ -0,0 +1,11 @@ +(ns buchdesmonats.hash + (:import [java.security MessageDigest] + [java.math BigInteger])) + +;;; Stolen from: https://gist.github.com/jizhang/4325757#gistcomment-2633984 +(defn md5 [^String s] + (->> s + .getBytes + (.digest (MessageDigest/getInstance "MD5")) + (BigInteger. 1) + (format "%032x"))) diff --git a/src/buchdesmonats/sources/lovelybooks.clj b/src/buchdesmonats/sources/lovelybooks.clj new file mode 100644 index 0000000..7920373 --- /dev/null +++ b/src/buchdesmonats/sources/lovelybooks.clj @@ -0,0 +1,21 @@ +(ns buchdesmonats.sources.lovelybooks + (:require [net.cgrand.enlive-html :as html] + [clojure.string :as str])) + +(defn encode-url-part [part] + (java.net.URLEncoder/encode part "UTF-8")) + +(defn encode-url [url] + (let [parts (map encode-url-part (re-find #"\/autor\/([^/]+)\/([^/]+)\/?" url)) + author (nth parts 1) + book-title (nth parts 2)] + (str "https://lovelybooks.de/autor/" author "/" book-title))) + +(defn find-cover-image [url] + (let [encoded-url (encode-url url) + src (-> (html/html-resource (java.net.URL. 
encoded-url)) + (html/select [:img.ResponsiveImage.BookCover]) + first + (get-in [:attrs :srcset]) + (str/split #" "))] + (last (filter #(re-matches #"http.+\.jpg" %) src)))) diff --git a/src/buchdesmonats/sources/mojoreads.clj b/src/buchdesmonats/sources/mojoreads.clj new file mode 100644 index 0000000..ce9d898 --- /dev/null +++ b/src/buchdesmonats/sources/mojoreads.clj @@ -0,0 +1,8 @@ +(ns buchdesmonats.sources.mojoreads + (:require [net.cgrand.enlive-html :as html])) + +(defn find-cover-image [url] + (-> (html/html-resource (java.net.URL. url)) + (html/select [:div.mojoreads-page-content-container :img]) + (first) + (get-in [:attrs :src])))