From 003401b5f1a309b06865ca8c70b008839c2a7cad Mon Sep 17 00:00:00 2001
From: Aaron Mueller
Date: Fri, 21 Mar 2014 00:42:43 +0100
Subject: [PATCH] Initial commit

---
 project.clj                | 10 +++++
 src/buchdesmonats/core.clj | 86 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 96 insertions(+)
 create mode 100644 project.clj
 create mode 100644 src/buchdesmonats/core.clj

diff --git a/project.clj b/project.clj
new file mode 100644
index 0000000..29bebea
--- /dev/null
+++ b/project.clj
@@ -0,0 +1,10 @@
+(defproject buchdesmonats "1.0"
+  :description "A simple tool to fetch all books of the months from the cthn.de project"
+  :url "http://cthn.de/projects/buch_des_monats"
+  :license {:name "Eclipse Public License"
+            :url "http://www.eclipse.org/legal/epl-v10.html"}
+  :dependencies [[org.clojure/clojure "1.5.1"]
+                 [enlive "1.1.5"]
+                 [me.raynes/fs "1.4.4"]
+                 [clj-http "0.9.1"]]
+  :main buchdesmonats.core)
diff --git a/src/buchdesmonats/core.clj b/src/buchdesmonats/core.clj
new file mode 100644
index 0000000..ea662e5
--- /dev/null
+++ b/src/buchdesmonats/core.clj
@@ -0,0 +1,86 @@
+(ns buchdesmonats.core
+  (:require [net.cgrand.enlive-html :as html]
+            [clj-http.client :as http-client]
+            [clojure.string :as str]
+            [clojure.java.io :as io]))
+
+(defn imgurl->bytes
+  "Downloads the cover image of the book page at `lovelybooks-url`.
+
+  Fetches the page, takes the `src` attribute of the first element matching
+  `div.bookcoverXXL > div > img`, requests that image and returns the
+  response body as a byte array. Throws `ex-info` when the page has no such
+  image."
+  [lovelybooks-url]
+  (let [src (-> (java.net.URL. lovelybooks-url)
+                html/html-resource
+                (html/select [:div.bookcoverXXL :> :div :> :img])
+                first
+                (get-in [:attrs :src]))]
+    (when (str/blank? src)
+      (throw (ex-info "No cover image found on page" {:url lovelybooks-url})))
+    ;; Only a protocol-relative source ("//host/path") lacks a scheme; an
+    ;; already absolute URL must not receive a second "http:" prefix.
+    (-> (if (.startsWith ^String src "//") (str "http:" src) src)
+        (http-client/get {:as :byte-array})
+        :body)))
+
+(defn url->filename
+  "Builds the lower-cased file name \"<author>_<title>.jpg\" from a book URL
+  shaped like \".../autor/<author>/<title>-<digits>-<char>/\".
+
+  Throws `ex-info` for any other URL, because every unmatched URL would
+  otherwise map to the same file name \"_.jpg\"."
+  [lovelybooks-url]
+  (if-let [[_ author title] (re-find #".\/autor\/([^\/]+)\/(.+)-([0-9]+)-.\/$"
+                                     lovelybooks-url)]
+    (str/lower-case (str author "_" title ".jpg"))
+    (throw (ex-info "Not a lovelybooks book URL" {:url lovelybooks-url}))))
+
+
+(defn scrape-book-urls
+  "Requests `github-url` and returns a lazy seq with the link target of every
+  line of the response body that looks like `* ... [text](target)`."
+  [github-url]
+  (->> (http-client/get github-url)
+       :body
+       str/split-lines
+       (keep #(second (re-find #"^\* .*\[.+\]\((.+)\)" %)))))
+
+(defn scrape-book-cover
+  "Downloads the cover for the book at `url` and writes it to the file named
+  by `url->filename`."
+  [url]
+  ;; Resolve the name and download before opening the file, so a failure
+  ;; cannot leave an empty file behind.
+  (let [filename (url->filename url)
+        cover    (imgurl->bytes url)]
+    (with-open [out (io/output-stream filename)]
+      (.write out cover))))
+
+(defn- scrape-book-cover-safely
+  "Runs `scrape-book-cover` on `url`. A failure is reported on stderr and
+  yields nil, so one bad book does not abort the remaining downloads."
+  [url]
+  (try
+    (scrape-book-cover url)
+    (catch Exception e
+      (binding [*out* *err*]
+        (println "Skipping" url "-" (.getMessage e))))))
+
+
+(defn -main
+  "Downloads the covers of all books linked from the CTHN \"Buch des Monats\"
+  wiki page. Command-line arguments are ignored. Returns true.
+
+  Shuts down the agent thread pool when done, so call it only as the
+  program entry point, not from a long-lived REPL session."
+  [& args]
+  (let [books-url "https://raw.github.com/CTHN/wiki-data/master/pages/projects/buch_des_monats.mkd"]
+    (try
+      (doall (pmap scrape-book-cover-safely (scrape-book-urls books-url)))
+      (finally
+        ;; pmap runs on the agent thread pool; without this its idle,
+        ;; non-daemon threads delay JVM exit after the work is done.
+        (shutdown-agents)))
+    true))