From c247774256523d75937c94aa2dade84cee5b29e5 Mon Sep 17 00:00:00 2001
From: Travis Shears
Date: Fri, 18 Jul 2025 12:40:41 +0200
Subject: [PATCH] init blue sky scraping

---
Review notes: line breaks restored; hunk sizes and the diffstat describe the
reviewed content. The blob ids on the index lines are kept from the original
export and no longer match the new file contents.

 config.sample.edn              |  10 +++-
 src/micro_blog/blue_sky.clj    | 100 +++++++++++++++++++++++++++++++++
 src/micro_blog/pocket_base.clj |  92 ++++++++++++++++++++++++++++++
 src/micro_blog/utils.clj       |  13 +++++
 4 files changed, 214 insertions(+), 1 deletion(-)
 create mode 100644 src/micro_blog/blue_sky.clj
 create mode 100644 src/micro_blog/pocket_base.clj
 create mode 100644 src/micro_blog/utils.clj

diff --git a/config.sample.edn b/config.sample.edn
index e4025ef..4aa25de 100644
--- a/config.sample.edn
+++ b/config.sample.edn
@@ -1,3 +1,11 @@
 {:mistral-api-key "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
  :mistral-agent-id "ag:xxxxxxxx:xxxxxxxx:xxxxxxxxxxxxxxxxxxx:xxxxxxxx"
- :mistral-host "https://api.mistral.ai"}
+ :mistral-host "https://api.mistral.ai"
+
+ :pocket-base-pw "xxxxxxxx"
+ :pocket-base-user "xxxxxxxxxxxxxxxxxxx"
+ :pocket-base-host "xxxxxxxxxxxxx"
+
+ :blue-sky-api-key "xxxxxxxxxxxxxxxxxxx"
+ :blue-sky-username "coolguy.bsky.social"
+ :blue-sky-host "https://bsky.social/xrpc"}
diff --git a/src/micro_blog/blue_sky.clj b/src/micro_blog/blue_sky.clj
new file mode 100644
index 0000000..6f7bd8f
--- /dev/null
+++ b/src/micro_blog/blue_sky.clj
@@ -0,0 +1,100 @@
+(ns micro-blog.blue-sky
+  (:require
+   [clj-http.client :as http-client]
+   [micro-blog.pocket-base :as pb]
+   [micro-blog.utils :as utils]
+   [malli.core :as m]
+   [clojure.pprint :refer [pprint]]
+   [micro-blog.config :refer [config]]))
+
+(defn create-session
+  "Logs in through com.atproto.server.createSession with the configured
+  :blue-sky-username and :blue-sky-api-key. Returns a map with :did and
+  :access-jwt; throws when the response lacks :did or :accessJwt."
+  []
+  (let [identifier (@config :blue-sky-username)
+        api-key (@config :blue-sky-api-key)
+        body {:identifier identifier :password api-key}
+        url (str (@config :blue-sky-host) "/com.atproto.server.createSession")
+        res-schema [:map
+                    [:did string?]
+                    [:accessJwt string?]]]
+    (-> (http-client/post url
+                          {:form-params body
+                           :content-type :json
+                           :as :json})
+        :body
+        (utils/validate-with-throw res-schema)
+        (#(assoc % :access-jwt (:accessJwt %)))
+        (select-keys [:did :access-jwt]))))
+
+;; Shape of an app.bsky.feed.getAuthorFeed response. :cursor is optional
+;; because the API leaves it out once there are no further pages.
+(def post-res-schema [:map
+                      [:cursor {:optional true} [:maybe :string]]
+                      [:feed [:vector
+                              [:map [:post [:map
+                                            [:cid :string]
+                                            [:author [:map
+                                                      [:handle :string]]]
+
+                                            [:record [:map
+                                                      [:createdAt :string]]]]]]]]])
+
+(defn get-posts-until-id
+  "Pages through the session's author feed and collects posts until the one
+  whose :cid equals `id` is reached, the feed is exhausted, or the API stops
+  returning a cursor. Returns the collected posts in feed order (the order
+  the API returned them), without the post matching `id`."
+  ([session id] (get-posts-until-id session id nil []))
+  ([session id cursor prev-posts]
+   (let [limit 5
+         body
+         (-> (http-client/get (str (@config :blue-sky-host) "/app.bsky.feed.getAuthorFeed")
+                              {:headers {"Authorization" (str "Bearer " (session :access-jwt))}
+                               :query-params (cond-> {:actor (:did session)
+                                                      :limit limit}
+                                               cursor (assoc :cursor cursor))
+                               :content-type :json
+                               :as :json})
+             :body
+             (utils/validate-with-throw post-res-schema))
+         posts (map :post (:feed body))
+         new-cursor (:cursor body)
+         new-posts (take-while #(not= (:cid %) id) posts)
+         ;; later pages go after earlier ones so feed order is kept
+         collected (concat prev-posts new-posts)]
+     (cond
+       ;; end of posts
+       (not= (count posts) limit) collected
+       ;; no cursor: asking again would only refetch the first page
+       (nil? new-cursor) collected
+       ;; found post
+       (some #(= id (:cid %)) posts) collected
+       ;; recur
+       :else (recur session id new-cursor collected)))))
+
+;; TODO: create post
+;; const data = {
+;; "remoteId": "test",
+;; "authorId": "test",
+;; "posted": "2022-01-01 10:00:00.123Z",
+;; "source": "pleroma",
+;; "tags": [
+;; "RELATION_RECORD_ID"
+;; ],
+;; "fullPost": "JSON",
+;; "images": [
+;; "RELATION_RECORD_ID"
+;; ]
+;; };
+
+(defn run
+  "Creates a session, looks up the remote id of the latest stored :blue_sky
+  post and fetches the feed posts that precede it. Returns the session, that
+  id and the fetched posts; nothing is written to PocketBase yet."
+  []
+  (let [session (create-session)
+        last-saved-id (pb/get-latest-post-remote-id-by-source :blue_sky)
+        new-posts (get-posts-until-id session last-saved-id)]
+    {:session session :last-saved-id last-saved-id :new-posts new-posts}))
diff --git a/src/micro_blog/pocket_base.clj b/src/micro_blog/pocket_base.clj
new file mode 100644
index 0000000..5a027ae
--- /dev/null
+++ b/src/micro_blog/pocket_base.clj
@@ -0,0 +1,92 @@
+(ns micro-blog.pocket-base
+  (:require
+   [clojure.pprint :refer [pprint]]
+   [clojure.string :as str]
+   [clj-http.client :as http-client]
+   [malli.core :as m]
+   [micro-blog.config :refer [config]]))
+
+;; Cached PocketBase auth token together with the Instant it was fetched.
+(defonce ^:private token-cache (atom {:token nil :fetched-at nil}))
+
+(defn- now []
+  (java.time.Instant/now))
+
+(defn older-then-a-day?
+  "Returns true when at least 23 full hours lie between `fetched-at` and
+  now, and nil when `fetched-at` is nil."
+  [fetched-at]
+  (when fetched-at
+    (let [age (java.time.Duration/between fetched-at (now))]
+      ;; .toHours truncates, so `> 23` would only trigger after 24 hours;
+      ;; `>=` gives the intended one-hour safety margin
+      (>= (.toHours age) 23))))
+
+(defn get-login-token
+  "Authenticates against PocketBase with the configured :pocket-base-user
+  and :pocket-base-pw and returns the :token of the response body."
+  []
+  (let [user-name (@config :pocket-base-user)
+        pw (@config :pocket-base-pw)
+        body {:identity user-name :password pw}
+        url (str (@config :pocket-base-host) "/api/collections/users/auth-with-password")]
+    (->
+     (http-client/post url
+                       {:form-params body
+                        :content-type :json
+                        :as :json})
+     :body
+     :token)))
+
+(defn get-login-token-with-cache
+  "Returns the cached token while it is fresh (see `older-then-a-day?`);
+  otherwise fetches a new one, stores it in the cache and returns it."
+  []
+  (let [{:keys [token fetched-at]} @token-cache]
+    (if (and token (not (older-then-a-day? fetched-at)))
+      token
+      (do
+        ;; log first so the message shows up even when the request throws
+        (println "Getting new login token")
+        (let [new-token (get-login-token)]
+          (reset! token-cache {:token new-token :fetched-at (now)})
+          new-token)))))
+
+;; Values accepted as the `source` of a stored post.
+(def sources #{:pleroma :blue_sky :mastodon :pixelfed :nostr})
+
+(defn valid-source?
+  "True when `source` is one of `sources`."
+  [source]
+  (contains? sources source))
+
+(defn get-latest-post-remote-id-by-source
+  "Returns the :remoteId of the first micro_blog_posts record for `source`
+  when sorted by \"-posted\", or nil when there is none. Throws ex-info for
+  a source outside `sources` or a response that does not match the schema."
+  [source]
+  (let [res-schema
+        [:map
+         [:items
+          [:vector
+           [:map
+            [:id string?]
+            [:remoteId string?]]]]]]
+    (when-not (valid-source? source)
+      (throw (ex-info "Invalid source" {:source source})))
+    (let [body (:body
+                (http-client/get (str (@config :pocket-base-host) "/api/collections/micro_blog_posts/records")
+                                 {:headers {"Authorization" (get-login-token-with-cache)}
+                                  :query-params {:page 1
+                                                 "perPage" 1
+                                                 :sort "-posted"
+                                                 :filter (str "source = '" (name source) "'")
+                                                 :fields (str/join "," ["remoteId" "id"])
+                                                 "skipTotal" true}
+                                  :content-type :json
+                                  :as :json}))]
+      (when-not (m/validate res-schema body)
+        ;; ship the malli explanation with the exception for debugging
+        (throw (ex-info "Res does not follow schema"
+                        {:res body :explain (m/explain res-schema body)})))
+      (-> body :items first :remoteId))))
diff --git a/src/micro_blog/utils.clj b/src/micro_blog/utils.clj
new file mode 100644
index 0000000..2d68e46
--- /dev/null
+++ b/src/micro_blog/utils.clj
@@ -0,0 +1,13 @@
+(ns micro-blog.utils
+  (:require [malli.core :as m]))
+
+(defn validate-with-throw
+  "Returns `value` when it satisfies the malli `schema`; otherwise throws an
+  ex-info holding the value and the malli explanation. `value` comes first
+  so the call fits into a `->` pipeline."
+  [value schema]
+  (if (m/validate schema value)
+    value
+    (throw (ex-info "Res does not follow schema"
+                    {:value value
+                     :explain (m/explain schema value)}))))