From 74f63c1269a44edc29136365420bdfc84268c85f Mon Sep 17 00:00:00 2001
From: Aaron Mueller
Date: Sat, 28 Sep 2013 12:34:44 +0200
Subject: [PATCH] Start implementing the webscraping part

---
 src/ldview/tasks/ldscrape.clj | 98 +++++++++++++++++++++++++++++++++++
 1 file changed, 98 insertions(+)
 create mode 100644 src/ldview/tasks/ldscrape.clj

diff --git a/src/ldview/tasks/ldscrape.clj b/src/ldview/tasks/ldscrape.clj
new file mode 100644
index 0000000..daa9e78
--- /dev/null
+++ b/src/ldview/tasks/ldscrape.clj
@@ -0,0 +1,98 @@
+(ns ldview.tasks.ldscrape
+  "Scrapes entry listings and entry details from the Ludum Dare competition
+  website using enlive."
+  (:require [clojure.string :as string]
+            [net.cgrand.enlive-html :as html]))
+
+;; Root URL of the competition site. Dynamic so callers can rebind it.
+(def ^:dynamic *base-url* "http://www.ludumdare.com/compo/")
+
+;; Number of the competition whose entries are scraped. Dynamic so callers
+;; can rebind it.
+(def ^:dynamic *competition* 27)
+
+(defn url-action
+  "Returns the URL of the current competition's page with the query
+  parameter action=`action`."
+  [action]
+  ;; Trailing slashes are stripped from *base-url* before joining, so the
+  ;; result contains a single separator whether or not the base ends in "/".
+  (str (string/replace *base-url* #"/+$" "")
+       "/ludum-dare-" *competition* "/?action=" action))
+
+(defn url-page
+  "Returns the URL of the entry listing with `page` passed as the start
+  query parameter."
+  [page]
+  (str (url-action "preview") "&start=" page))
+
+(defn url-entry
+  "Returns the URL of the detail view of the entry with id `entry-id`."
+  [entry-id]
+  (str (url-action "preview") "&uid=" entry-id))
+
+(defn fetch-url
+  "Fetches `url` and returns the HTML parsed by enlive's html-resource."
+  [url]
+  (html/html-resource (java.net.URL. url)))
+
+(defn number-of-pages
+  "Returns, as a long, the number shown by the second-to-last link in the
+  second paragraph directly under div#compo2 on the listing page.
+  Throws NumberFormatException when that link's text is not an integer."
+  []
+  (let [listing (fetch-url (url-action "preview"))
+        pager   (second (html/select listing [:div#compo2 :> :p]))
+        anchor  (last (butlast (html/select [pager] [:a])))]
+    ;; The text comes from a remote page, so parse it strictly as a number
+    ;; instead of handing it to the Clojure reader.
+    (Long/parseLong (string/trim (html/text anchor)))))
+
+(defn entries-on-page
+  "Returns the entry ids linked from listing page `page`: for each element
+  with class alt-1 or alt-2, the text after the last \"=\" in the href of
+  its first link. Elements without a link are skipped."
+  [page]
+  (let [cells (html/select (fetch-url (url-page page)) #{[:.alt-1] [:.alt-2]})
+        hrefs (keep #(:href (:attrs (first (html/select [%] [:a])))) cells)]
+    (map #(last (string/split % #"=")) hrefs)))
+
+(defn links-on-entry
+  "Returns a map with :title and :url for every link that is a direct child
+  of a p.links element inside `content`."
+  [content]
+  (for [anchor (html/select [content] [:p.links :> :a])]
+    {:title (first (:content anchor))
+     :url   (:href (:attrs anchor))}))
+
+(defn images-on-entry
+  "Returns the href of every link matched by the selector
+  [:table first-child :a] inside `content`."
+  [content]
+  (map (comp :href :attrs)
+       (html/select [content] [:table html/first-child :a])))
+
+(defn entry-details
+  "Fetches the page of entry `entry-id` and returns a map with :title,
+  :author, :description, :links and :images.
+
+  :title and :author come from splitting the first h3 on \" - \" and are nil
+  when there is no h3; :description is the text of the third paragraph, or
+  nil when there are fewer than three."
+  [entry-id]
+  (let [page      (fetch-url (url-entry entry-id))
+        content   (first (html/select page [:div#compo2]))
+        heading   (first (html/texts (html/select [content] [:h3])))
+        [title author] (when heading (string/split heading #" - "))
+        paragraph (nth (html/select [content] [:p]) 2 nil)]
+    {:title       title
+     :author      author
+     :description (when paragraph (html/text paragraph))
+     :links       (links-on-entry content)
+     :images      (images-on-entry content)}))
+
+(defn fetch-page
+  "Returns a lazy sequence with the details of every entry listed on
+  `page`."
+  [page]
+  (map entry-details (entries-on-page page)))