buch_des_monats/src/buchdesmonats/core.clj

48 lines
1.6 KiB
Clojure
Raw Normal View History

2014-03-21 00:42:43 +01:00
(ns buchdesmonats.core
2014-03-24 21:55:07 +01:00
(:gen-class)
2014-03-21 00:42:43 +01:00
(:require [net.cgrand.enlive-html :as html]
[clj-http.client :as http-client]
[clojure.string :as str]
2014-03-24 21:55:07 +01:00
[clojure.java.io :as io]
[me.raynes.fs :as fs]))
(def config
  "Static settings for the scraper.

  :books-url  - URL of the markdown page that lists the books.
  :target-dir - directory (a java.io.File) where cover images are stored."
  {:target-dir (io/file "public" "book-covers")
   :books-url  "https://raw.github.com/CTHN/wiki-data/master/pages/projects/buch_des_monats.mkd"})
2014-03-21 00:42:43 +01:00
(defn imgurl->bytes
  "Downloads the cover image referenced on a book page.

  Fetches and parses the page at `lovelybooks-url`, takes the first `img`
  matched by the selector `div.bookcoverXXL > div > img`, and returns the body
  of an HTTP GET on its `src` attribute as a byte array.

  A `src` without a URL scheme (e.g. protocol-relative `//host/path`) is
  prefixed with \"http:\"; a `src` that already carries a scheme is used as is.

  Throws `ex-info` with `{:url lovelybooks-url}` as data when no matching image
  is found or its `src` is missing or blank."
  [lovelybooks-url]
  (let [cover-src (-> (java.net.URL. lovelybooks-url)
                      html/html-resource
                      (html/select [:div.bookcoverXXL :> :div :> :img])
                      first
                      (get-in [:attrs :src]))]
    ;; Without this guard a missing image would turn into a request for the
    ;; bare URL "http:" and fail with an unrelated HTTP client error.
    (when (str/blank? cover-src)
      (throw (ex-info "No cover image found on book page"
                      {:url lovelybooks-url})))
    (let [image-url (if (re-find #"^[A-Za-z][A-Za-z0-9+.-]*:" cover-src)
                      cover-src                 ; already absolute
                      (str "http:" cover-src))] ; scheme-less, as before
      (:body (http-client/get image-url {:as :byte-array})))))
2014-03-24 21:55:07 +01:00
(defn url->file
  "Maps a book page URL to the cover image file inside `target-dir`.

  The author and title segments are captured from a URL ending in
  `/autor/<author>/<title>-<digits>-<char>/`; the file name is the lower-cased
  string \"<author>_<title>.jpg\". Returns a java.io.File.

  Note: when the URL does not match the pattern, both captures are nil and the
  resulting file name is \"_.jpg\"."
  [lovelybooks-url target-dir]
  (let [match     (re-find #".\/autor\/([^\/]+)\/(.+)-([0-9]+)-.\/$" lovelybooks-url)
        author    (nth match 1 nil)
        title     (nth match 2 nil)
        file-name (-> (str author "_" title ".jpg")
                      str/lower-case)]
    (io/file target-dir file-name)))
2014-03-21 00:42:43 +01:00
(defn scrape-book-urls
  "Returns a lazy seq of link targets found in the markdown at `github-url`.

  The document is fetched over HTTP and split into lines. For every line that
  starts with \"* \" and contains a markdown link `[text](target)`, the captured
  target is returned; lines without such a link are skipped."
  [github-url]
  (let [markdown    (:body (http-client/get github-url))
        link-target (fn [line]
                      (second (re-find #"^\* .*\[.+\]\((.+)\)" line)))]
    ;; `keep` drops the nil produced for lines that do not match.
    (keep link-target (str/split-lines markdown))))
2014-03-24 21:55:07 +01:00
(defn scrape-book-cover
  "Downloads the cover image for the book page `url` and writes it to the file
  that `url->file` derives for it inside `target-dir`.

  The image is downloaded before the target file is opened, so a failed
  download does not leave an empty file behind that would later be treated as
  an existing cover. Returns nil."
  [url target-dir]
  (let [target-file (url->file url target-dir)
        ;; Fetch first: opening the output stream creates/truncates the file.
        cover-bytes (imgurl->bytes url)]
    (with-open [out (io/output-stream target-file)]
      (.write out cover-bytes))))
2014-03-21 00:42:43 +01:00
2014-03-24 21:55:07 +01:00
(defn find-missing-covers
  "Returns a lazy seq of the book URLs listed at `books-url` whose cover file
  (as computed by `url->file`) does not yet exist in `target-dir`."
  [books-url target-dir]
  (let [cover-present? (fn [book-url]
                         (fs/exists? (url->file book-url target-dir)))]
    (->> (scrape-book-urls books-url)
         (filter (complement cover-present?)))))
2014-03-21 00:42:43 +01:00
(defn -main
  "Entry point: creates the configured target directory, then downloads every
  missing book cover in parallel via `pmap`. Command-line `args` are ignored.
  Returns true."
  [& args]
  (let [covers-dir (:target-dir config)
        download!  (fn [book-url]
                     (scrape-book-cover book-url covers-dir))]
    (fs/mkdirs covers-dir)
    ;; Force the lazy pmap so all downloads actually run; results are unused.
    (dorun (pmap download!
                 (find-missing-covers (:books-url config) covers-dir)))
    true))