Start implementing the webscraping part

This commit is contained in:
Aaron Mueller 2013-09-28 12:34:44 +02:00
parent 04635d2758
commit 74f63c1269

View file

@ -0,0 +1,50 @@
(ns ldview.ldscrape
(:require [net.cgrand.enlive-html :as html])
(:use [clojure.string :only (split)]))
(def ^{:dynamic true} *base-url*
  "Root URL that `url-action` prefixes to every URL it builds.
  Dynamic, so it can be rebound with `binding`."
  "http://www.ludumdare.com/compo/")

(def ^{:dynamic true} *competition*
  "Competition number that `url-action` interpolates into the
  \"ludum-dare-<n>\" path segment. Dynamic, so it can be rebound with `binding`."
  27)
(defn url-action
  "Builds the competition URL for the given `action` query value, e.g.
  (url-action \"preview\")
  => \"http://www.ludumdare.com/compo/ludum-dare-27/?action=preview\"
  with the default bindings of `*base-url*` and `*competition*`."
  [action]
  ;; *base-url* may or may not end in a slash; strip any trailing slashes so
  ;; the joined path never contains a doubled \"//\" before \"ludum-dare-\".
  ;; clojure.string is already loaded by this namespace's ns form, so the
  ;; fully-qualified reference resolves without a new require.
  (let [base (clojure.string/replace *base-url* #"/+$" "")]
    (str base "/ludum-dare-" *competition* "/?action=" action)))
(defn url-page
  "Returns the \"preview\" action URL with `page` appended as the value of
  the `start` query parameter."
  [page]
  (let [preview-url (url-action "preview")]
    (str preview-url "&start=" page)))
(defn url-entry
  "Returns the \"preview\" action URL with `entry-id` appended as the value
  of the `uid` query parameter."
  [entry-id]
  (let [preview-url (url-action "preview")]
    (str preview-url "&uid=" entry-id)))
(defn fetch-url
  "Wraps the string `url` in a java.net.URL and hands it to enlive's
  `html-resource`, returning whatever that call produces."
  [url]
  (let [location (java.net.URL. url)]
    (html/html-resource location)))
(defn number-of-pages
  "Fetches the \"preview\" listing and returns, as a Long, the number shown
  by the second-to-last link inside the second <p> that is a direct child of
  div#compo2.

  Throws NumberFormatException when that link's text is not a plain integer."
  []
  (let [listing   (fetch-url (url-action "preview"))
        paragraph (second (html/select listing [:div#compo2 :> :p]))
        anchors   (html/select [paragraph] [:a])
        ;; The last link is skipped; the one before it carries the number.
        ;; NOTE(review): presumably the last link is a \"next\" arrow - confirm.
        page-text (html/text (last (butlast anchors)))]
    ;; The text comes from a remote page, so it must not go through the
    ;; Clojure reader (read-string can evaluate #=(...) forms and would
    ;; happily return symbols or collections). Parse it strictly as a number.
    (Long/parseLong (.trim ^String page-text))))
(defn entries-on-page
  "Fetches listing `page` and returns a lazy seq of entry ids, one per
  element with class alt-1 or alt-2 that contains a link. Each id is the text
  after the last \"=\" in the href of the element's first <a>.

  Matched elements without any <a> (or whose first <a> has no href) are
  skipped instead of causing a NullPointerException in `split`."
  [page]
  (let [cells (html/select (fetch-url (url-page page)) #{[:.alt-1] [:.alt-2]})
        ;; `keep` drops the nil produced by link-less cells.
        hrefs (keep #(get-in (first (html/select [%] [:a])) [:attrs :href])
                    cells)]
    (map #(last (split % #"=")) hrefs)))
(defn links-on-entry
  "Returns a lazy seq of {:title ... :url ...} maps, one for every <a> that
  is a direct child of a p.links element inside `content`. :title is the
  first child of the anchor and :url is its href attribute."
  [content]
  (for [anchor (html/select [content] [:p.links :> :a])]
    {:title (first (:content anchor))
     :url   (get-in anchor [:attrs :href])}))
(defn images-on-entry
  "Returns a lazy seq with the href attribute of every <a> matched by the
  selector [:table html/first-child :a] inside `content`.
  NOTE(review): presumably these anchors wrap the entry's screenshots - confirm."
  [content]
  (for [anchor (html/select [content] [:table html/first-child :a])]
    (get-in anchor [:attrs :href])))
(defn- split-title-author
  "Splits a \"Title - Author\" heading at its LAST \" - \" and returns
  [title author]. Returns [heading nil] when there is no separator (or
  nothing follows it), and nil when `heading` itself is nil."
  [heading]
  (when heading
    (let [separator " - "
          idx       (.lastIndexOf ^String heading separator)]
      (if (neg? idx)
        [heading nil]
        [(subs heading 0 idx)
         (not-empty (subs heading (+ idx (count separator))))]))))

(defn entry-details
  "Fetches the page of entry `entry-id` and returns a map with:
    :title       - text of the first <h3> in div#compo2, before the last \" - \"
    :author      - text after that last \" - \" (nil if absent)
    :description - text of the third <p> in div#compo2 (nil if there is none)
    :links       - result of `links-on-entry`
    :images      - result of `images-on-entry`"
  [entry-id]
  (let [content    (first (html/select (fetch-url (url-entry entry-id))
                                       [:div#compo2]))
        heading    (first (html/texts (html/select [content] [:h3])))
        ;; Split on the last separator so titles that themselves contain
        ;; \" - \" stay intact; a missing <h3> yields nil title and author
        ;; instead of a NullPointerException.
        [title author] (split-title-author heading)
        paragraphs (html/select [content] [:p])
        ;; Index 2 = third paragraph; tolerate pages with fewer paragraphs.
        third-p    (nth paragraphs 2 nil)]
    {:title       title
     :author      author
     :description (when third-p (html/text third-p))
     :links       (links-on-entry content)
     :images      (images-on-entry content)}))
(defn fetch-page
  "Returns a lazy seq of `entry-details` maps, one for each entry id that
  `entries-on-page` finds on listing `page`."
  [page]
  ;; Pass the requested page through; it used to be ignored in favour of a
  ;; hard-coded 1, so every call returned the same page.
  (map entry-details (entries-on-page page)))