diff --git a/lib/bdfr_browser/http/plug.ex b/lib/bdfr_browser/http/plug.ex
index d9d098e..3566e0a 100644
--- a/lib/bdfr_browser/http/plug.ex
+++ b/lib/bdfr_browser/http/plug.ex
@@ -1,7 +1,7 @@
 defmodule BdfrBrowser.HTTP.Plug do
   use Plug.Router
 
-  alias BdfrBrowser.{Chat, Comment, Message, Repo, Post, Subreddit}
+  alias BdfrBrowser.{Chat, Comment, Importer, Message, Repo, Post, Subreddit}
 
   plug :match
   plug :dispatch
@@ -150,15 +150,20 @@ defmodule BdfrBrowser.HTTP.Plug do
   end
 
   post "/_import" do
-    :ok = BdfrBrowser.Importer.background_import()
+    :ok = Importer.background_import()
     send_resp(conn, 200, "IMPORTING")
   end
 
   post "/_import_changes" do
-    :ok = BdfrBrowser.Importer.background_import_changes()
+    :ok = Importer.background_import_changes()
     send_resp(conn, 200, "IMPORTING CHANGES")
   end
 
+  post "/_cleanup" do
+    :ok = Importer.cleanup_messages()
+    send_resp(conn, 200, "CLEANED UP")
+  end
+
   get "/_ping" do
     send_resp(conn, 200, "PONG")
   end
diff --git a/lib/bdfr_browser/importer.ex b/lib/bdfr_browser/importer.ex
index c1885cb..c846813 100644
--- a/lib/bdfr_browser/importer.ex
+++ b/lib/bdfr_browser/importer.ex
@@ -5,6 +5,8 @@ defmodule BdfrBrowser.Importer do
 
   alias BdfrBrowser.{Chat, Comment, Message, Post, Repo, Subreddit}
 
+  @image_extensions [".jpg", ".jpeg", ".gif", ".png", ".webp"]
+
   defmodule State do
     use TypedStruct
 
@@ -77,6 +79,50 @@ defmodule BdfrBrowser.Importer do
     List.flatten(result)
   end
 
+  @doc """
+  Deletes placeholder and mis-keyed duplicates of imported image messages.
+
+  For every message returned by `Message.images/0`, the rows returned by
+  `Message.potential_duplicates/1` are deleted when they are one of:
+
+    * a bare `"Image"` / `"image"` placeholder,
+    * an image URL stored under the old id (hash of chat id and timestamp only),
+    * text that starts with `"image"` and ends with one of `@image_extensions`.
+
+  Returns `:ok`, which the `POST /_cleanup` route pattern-matches on.
+  """
+  def cleanup_messages do
+    all_images = Message.images() |> Repo.all()
+
+    dupes =
+      for image <- all_images, uniq: true do
+        # Id this message would have had before image content became part of the hash.
+        # NOTE(review): assumes the imported timestamp string equals
+        # DateTime.to_iso8601/1 of the stored `posted_at` - confirm.
+        incorrect_id =
+          :sha3_256
+          |> :crypto.hash([image.chat_id, DateTime.to_iso8601(image.posted_at)])
+          |> Base.encode16(case: :lower)
+
+        potential_dupes = Message.potential_duplicates(image) |> Repo.all()
+
+        Enum.filter(potential_dupes, fn msg ->
+          msg.message == "Image" or
+            msg.message == "image" or
+            (msg.id == incorrect_id and String.starts_with?(msg.message, ["mxc://", "https://i.redd.it/"])) or
+            (String.starts_with?(msg.message, "image") and String.ends_with?(msg.message, @image_extensions))
+        end)
+      end
+
+    # Several images can share a chat and timestamp and then report the same
+    # duplicate; deleting a row twice raises Ecto.StaleEntryError, so dedupe by id.
+    # Enum.each/2 returns :ok, which is the value the HTTP route asserts on.
+    dupes
+    |> List.flatten()
+    |> Enum.uniq_by(& &1.id)
+    |> Enum.each(&Repo.delete!/1)
+  end
+
   def background_import do
     GenServer.cast(__MODULE__, :background_import)
   end
@@ -326,7 +372,7 @@ defmodule BdfrBrowser.Importer do
   end
 
   defp import_message(message, chat) when not is_nil(chat) do
-    id = :sha3_256 |> :crypto.hash([chat.id, message["timestamp"]]) |> Base.encode16(case: :lower)
+    id = calculate_message_id(message, chat.id)
     message_content = message["content"]["Message"]
     {:ok, posted_at, 0} = DateTime.from_iso8601(message["timestamp"])
 
@@ -343,7 +389,11 @@ defmodule BdfrBrowser.Importer do
         conflict_target: :id
       )
 
-    existing_image = message_record.message == "Image" or String.starts_with?(message_record.message, "image")
+    existing_image =
+      message_record.message == "Image" or
+        message_record.message == "image" or
+        (String.starts_with?(message_record.message, "image") and
+           String.ends_with?(message_record.message, @image_extensions))
 
     message_record =
       if existing_image and String.starts_with?(message_content, "mxc://") do
@@ -355,4 +405,21 @@ defmodule BdfrBrowser.Importer do
 
     message_record
   end
+
+  # Builds the `id` for a chat message. Messages whose content starts with
+  # `mxc://` or `https://i.redd.it/` also hash that content, so two of them with
+  # the same chat id and timestamp but different content get different ids.
+  defp calculate_message_id(message, chat_id) do
+    message_content = message["content"]["Message"]
+    image? = String.starts_with?(message_content, ["mxc://", "https://i.redd.it/"])
+
+    hash_input =
+      if image? do
+        [chat_id, message["timestamp"], message_content]
+      else
+        [chat_id, message["timestamp"]]
+      end
+
+    :sha3_256 |> :crypto.hash(hash_input) |> Base.encode16(case: :lower)
+  end
 end
diff --git a/lib/bdfr_browser/message.ex b/lib/bdfr_browser/message.ex
index 326a7bc..83480a3 100644
--- a/lib/bdfr_browser/message.ex
+++ b/lib/bdfr_browser/message.ex
@@ -22,4 +22,26 @@ defmodule BdfrBrowser.Message do
       order_by: [asc: m.posted_at]
     )
   end
+
+  @doc """
+  Query for messages whose text starts with `mxc://` or `https://i.redd.it/`,
+  ordered by `posted_at` ascending.
+  """
+  def images do
+    from(m in __MODULE__,
+      where: like(m.message, "mxc://%") or like(m.message, "https://i.redd.it/%"),
+      order_by: [asc: m.posted_at]
+    )
+  end
+
+  @doc """
+  Query for every other message in the same chat as `other_m` that has exactly
+  the same `posted_at`.
+  """
+  def potential_duplicates(other_m) do
+    from(m in __MODULE__,
+      where: m.id != ^other_m.id and m.chat_id == ^other_m.chat_id and m.posted_at == ^other_m.posted_at,
+      order_by: [asc: m.posted_at]
+    )
+  end
 end