Start implementing the webscraping part
This commit is contained in:
parent
04635d2758
commit
74f63c1269
1 changed files with 50 additions and 0 deletions
50
src/ldview/tasks/ldscrape.clj
Normal file
50
src/ldview/tasks/ldscrape.clj
Normal file
|
@ -0,0 +1,50 @@
|
||||||
|
(ns ldview.ldscrape
|
||||||
|
(:require [net.cgrand.enlive-html :as html])
|
||||||
|
(:use [clojure.string :only (split)]))
|
||||||
|
|
||||||
|
;; Base URL of the competition site. Dynamic so callers can rebind it.
(def ^:dynamic *base-url* "http://www.ludumdare.com/compo/")

;; Number of the competition whose pages are addressed. Dynamic so callers
;; can rebind it.
(def ^:dynamic *competition* 27)

(defn url-action
  "Builds the URL for `action` on the current competition:
  <base>/ludum-dare-<competition>/?action=<action>.

  Trailing slashes on *base-url* are removed before the path is appended, so
  the result has exactly one slash between the base and \"ludum-dare-N\"
  whether or not *base-url* ends in one."
  [action]
  ;; `str` keeps the original tolerance for a nil or non-string *base-url*.
  (let [base (clojure.string/replace (str *base-url*) #"/+$" "")]
    (str base "/ludum-dare-" *competition* "/?action=" action)))
|
||||||
|
|
||||||
|
(defn url-page
  "Returns the \"preview\" action URL with `page` appended as the value of
  the `start` query parameter."
  [page]
  (let [preview-url (url-action "preview")]
    (str preview-url "&start=" page)))
|
||||||
|
|
||||||
|
(defn url-entry
  "Returns the \"preview\" action URL with `entry-id` appended as the value
  of the `uid` query parameter."
  [entry-id]
  (let [preview-url (url-action "preview")]
    (str preview-url "&uid=" entry-id)))
|
||||||
|
|
||||||
|
|
||||||
|
(defn fetch-url
  "Wraps the string `url` in a java.net.URL and returns the result of
  passing it to enlive's `html-resource`."
  [url]
  (-> url
      (java.net.URL.)
      (html/html-resource)))
|
||||||
|
|
||||||
|
(defn number-of-pages
  "Fetches the \"preview\" page and returns, as a Long, the number shown in
  the second-to-last link of the second `p` directly under `div#compo2`.
  NOTE(review): presumably that paragraph is the pager and its last link is
  a \"next\"-style link; confirm against the live markup.

  Throws NumberFormatException when the link text is not a decimal integer."
  []
  (let [page       (fetch-url (url-action "preview"))
        pager      (second (html/select page [:div#compo2 :> :p]))
        pager-refs (html/select [pager] [:a])
        last-page  (last (butlast pager-refs))
        label      (html/text last-page)]
    ;; The text comes from a remote page, so it must not go through
    ;; `read-string`: the Clojure reader can evaluate code embedded in its
    ;; input. Parse it strictly as a number instead.
    (Long/parseLong (.trim ^String label))))
|
||||||
|
|
||||||
|
(defn entries-on-page
  "Fetches the listing for `page` and returns a lazy sequence of strings,
  one per element with class `alt-1` or `alt-2`: the part of that element's
  first link's `href` that follows the last \"=\"."
  [page]
  (let [cells (html/select (fetch-url (url-page page)) #{[:.alt-1] [:.alt-2]})]
    (for [cell cells
          :let [anchor (first (html/select [cell] [:a]))
                href   (-> anchor :attrs :href)]]
      (last (split href #"=")))))
|
||||||
|
|
||||||
|
(defn links-on-entry
  "Returns a lazy sequence with one map per `a` element that is a direct
  child of a `p.links` element inside `content`. `:title` is the anchor's
  first child node and `:url` is its `href` attribute."
  [content]
  (for [anchor (html/select [content] [:p.links :> :a])]
    {:title (-> anchor :content first)
     :url   (-> anchor :attrs :href)}))
|
||||||
|
|
||||||
|
(defn images-on-entry
  "Returns a lazy sequence of `href` attribute values taken from the `a`
  elements matched inside `content` by the selector
  [:table html/first-child :a]."
  [content]
  (let [anchors (html/select [content] [:table html/first-child :a])]
    (map (comp :href :attrs) anchors)))
|
||||||
|
|
||||||
|
(defn entry-details
  "Fetches the page for `entry-id` and returns a map describing it:

    :title       first \" - \"-separated part of the first `h3` text
    :author      second \" - \"-separated part of the first `h3` text
    :description text of the third `p` element inside `div#compo2`
    :links       result of `links-on-entry` for `div#compo2`
    :images      result of `images-on-entry` for `div#compo2`"
  [entry-id]
  (let [page           (fetch-url (url-entry entry-id))
        root           (first (html/select page [:div#compo2]))
        heading        (->> (html/select [root] [:h3])
                            html/texts
                            first)
        [title author] (split heading #" - ")
        entry-links    (links-on-entry root)
        paragraphs     (html/select [root] [:p])
        ;; Index 2 is the third paragraph.
        description    (html/text (nth paragraphs 2))
        entry-images   (images-on-entry root)]
    {:title       title
     :author      author
     :description description
     :links       entry-links
     :images      entry-images}))
|
||||||
|
|
||||||
|
(defn fetch-page
  "Returns a lazy sequence of `entry-details` maps, one for each entry id
  that `entries-on-page` finds on `page`.

  Each element triggers its own fetch when it is realised."
  [page]
  ;; Use the requested page; the id list was previously always read from
  ;; page 1, regardless of the argument.
  (map entry-details (entries-on-page page)))
|
Loading…
Reference in a new issue