Write a runner for the web scraping stuff

This commit is contained in:
Aaron Mueller 2013-09-28 23:42:15 +02:00
parent d2d60cf48d
commit 452e3d3245
3 changed files with 43 additions and 12 deletions

View file

@ -0,0 +1,37 @@
(ns ldview.tasks.runner
(:require [ldview.tasks.scrape :as scrape]
[ldview.tasks.images :as images]
[me.raynes.fs :as fs])
(:use [ldview.util]))
(defn cleanup
  "Resets the image directory tree: removes *image-base-path* when it already
  exists, then creates the thumbs, fullscreen and raw sub-directories in that
  order. Returns the result of the last fs/mkdirs call."
  []
  (let [base *image-base-path*]
    (when (fs/exists? base)
      (fs/delete-dir base))
    ; Fold over the sub-directories so the value of the final mkdirs call is
    ; what gets returned.
    (reduce (fn [_ subdir] (fs/mkdirs (str base subdir)))
            nil
            ["/thumbs/" "/fullscreen/" "/raw/"])))
(defn save-entry
  "Downloads every screenshot listed under (:images entry) and derives the
  fullscreen and thumbnail variants from the raw download.

  Returns nil when the entry has no :images value; otherwise a fully realized
  sequence holding the result of the thumbnail step for each image URL."
  [entry]
  ; TODO: Save to Database
  (when (:images entry)
    ; `map` alone is lazy: without `doall` the downloads and conversions would
    ; only run if some caller happened to consume the returned sequence.
    (doall
      (map (fn [image-url]
             ; number is the digit run following "shot" in the URL, or nil
             ; when the URL does not contain that pattern.
             (let [number (second (re-find #"shot([0-9]+)" image-url))
                   raw-image-path (images/image-name "raw" number)]
               (scrape/save-image-from-url image-url raw-image-path)
               (images/sourceimage->fullscreen raw-image-path (images/image-name "fullscreen" number))
               (images/sourceimage->thumb raw-image-path (images/image-name "thumbs" number))))
           (:images entry)))))
; TODO: Make it multithreaded
(defn fetch-all-content
  "Fetches each result page and runs save-entry on every entry found on it.

  Returns a fully realized sequence with one element per page, each element
  being the sequence of save-entry results for that page."
  []
  (let [pages 1] ;(scrape/number-of-pages)]
    ; Every level is forced with `doall`; nested lazy `map`s would otherwise
    ; perform no work at all when nobody consumes the returned value.
    (doall
      (map (fn [page]
             ; Use the page argument instead of a hard-coded page number.
             (doall (map (comp doall save-entry) (scrape/fetch-page page))))
           ; NOTE(review): assumes pages are numbered from 1, matching the
           ; previously hard-coded (scrape/fetch-page 1) — confirm.
           (range 1 (inc pages))))))
(defn start
  "Entry point of the runner: calls cleanup first, then fetch-all-content,
  and returns whatever fetch-all-content returns."
  []
  (cleanup)
  (fetch-all-content))

View file

@ -22,9 +22,9 @@
(defn fetch-url [url] (defn fetch-url [url]
(html/html-resource (java.net.URL. url))) (html/html-resource (java.net.URL. url)))
(defn save-image-from-url [url target-path new-name] (defn save-image-from-url [url target-file]
(with-open [bodystream (:body (http/get url {:as :stream}))] (with-open [bodystream (:body (http/get url {:as :stream}))]
(io/copy bodystream (io/file (str target-path new-name))))) (io/copy bodystream (io/file target-file))))
; The actual scraping process. We crawl through the entire content results ; The actual scraping process. We crawl through the entire content results
@ -51,16 +51,12 @@
links (links-on-entry content) links (links-on-entry content)
description (html/text (nth (html/select [content] [:p]) 2)) description (html/text (nth (html/select [content] [:p]) 2))
images (images-on-entry content)] images (images-on-entry content)]
{:title title {:id entry-id
:title title
:author author :author author
:description description :description description
:links links :links links
:images images})) :images images}))
; Do the job
; TODO: Make it multithreaded and resumable.
(defn fetch-page [page] (defn fetch-page [page]
(map entry-details (entries-on-page page))) (map entry-details (entries-on-page page)))
(defn fetch-all []
(map fetch-page (number-of-pages)))

View file

@ -1,9 +1,7 @@
(ns ldview.util (ns ldview.util)
(:require [noir.io :as io]
[markdown.core :as md]))
; Global stuff that does not change over time and needed everywhere ; Global stuff that does not change over time and needed everywhere
(def ^:dynamic *base-url* "http://www.ludumdare.com/compo/") (def ^:dynamic *base-url* "http://www.ludumdare.com/compo/")
(def ^:dynamic *image-base-path* "content/images") (def ^:dynamic *image-base-path* "resources/public/img/ld")
(def ^:dynamic *competition* 27) (def ^:dynamic *competition* 27)