From cc4d7827adefc292d712b28ef88eb674af99f135 Mon Sep 17 00:00:00 2001 From: Naitian Zhou Date: Wed, 15 Nov 2023 00:03:18 -0500 Subject: [PATCH] Add case studies and data --- memes/analysis/Case Studies.ipynb | 2113 +++++++++++++++++++++++++++++ memes/data/README.md | 3 + 2 files changed, 2116 insertions(+) create mode 100644 memes/analysis/Case Studies.ipynb create mode 100644 memes/data/README.md diff --git a/memes/analysis/Case Studies.ipynb b/memes/analysis/Case Studies.ipynb new file mode 100644 index 0000000..697d789 --- /dev/null +++ b/memes/analysis/Case Studies.ipynb @@ -0,0 +1,2113 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "4f8189e8", + "metadata": {}, + "source": [ + "# Preparation" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "7bcd8288", + "metadata": {}, + "outputs": [], + "source": [ + "import itertools\n", + "from collections import namedtuple, defaultdict\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "6a5c03ef", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + ":219: RuntimeWarning: scipy._lib.messagestream.MessageStream size changed, may indicate binary incompatibility. 
def load_template_clusters(path=None):
    """Load the template -> semantic-cluster assignment table.

    The file is tab-separated with one ``template<TAB>cluster`` pair per line.

    Args:
        path: Location of the TSV file. When omitted (``None``), falls back to
            ``DATA_DIR / "semantic_clusters/all-8-processed-clusters.tsv"``,
            resolved at call time.

    Returns:
        A ``TplCluster`` namedtuple with two fields: ``tpl_to_cluster`` (dict
        mapping each template id to its cluster id) and ``cluster_to_tpl``
        (plain dict mapping each cluster id to the set of its template ids).
        Ids are kept as the strings read from the file.

    Raises:
        ValueError: If a non-blank line does not split into exactly two
            tab-separated fields.
    """
    TplCluster = namedtuple("TplCluster", ["tpl_to_cluster", "cluster_to_tpl"])

    if path is None:
        path = DATA_DIR / "semantic_clusters/all-8-processed-clusters.tsv"

    tpl_to_cluster = {}
    cluster_to_tpl = defaultdict(set)
    # The context manager guarantees the handle is closed; it used to leak.
    with open(path, "r") as handle:
        for line in handle:
            line = line.strip()
            if not line:
                # Tolerate blank / trailing empty lines instead of failing
                # on the two-field unpack below.
                continue
            tpl, cluster = line.split("\t")
            tpl_to_cluster[tpl] = cluster
            cluster_to_tpl[cluster].add(tpl)
    # Hand back a plain dict so missing clusters raise KeyError for callers.
    return TplCluster(tpl_to_cluster, dict(cluster_to_tpl))
"outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\r", + " 0%| | 0/157 [00:00= threshold:\n", + " print(\"\\t\" + matrix.idx2word[i] + \"\\t\\t\" + str(lor[i]))" + ] + }, + { + "cell_type": "markdown", + "id": "05eb3821", + "metadata": {}, + "source": [ + "## Does variation exist?" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "bf82cb30", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "ba2febe5f04f44b687d6e675450cecb0", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/100 [00:00 1.96:\n", + " all_pairs.append((c, clu, matrix.idx2word[i], lor[i]))" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "17865f47", + "metadata": {}, + "outputs": [], + "source": [ + "significant = pd.DataFrame(all_pairs, columns=[\"subreddit\", \"cluster\", \"template\", \"zscore\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "0bf40559", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "There is significant variation in 94 semantic clusters\n" + ] + } + ], + "source": [ + "print(f\"There is significant variation in {len(significant.cluster.unique())} semantic clusters\")" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "8be79a5d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "This spans 391 templates and 26 subreddits\n" + ] + } + ], + "source": [ + "print(f\"This spans {len(significant.template.unique())} templates and {len(significant.subreddit.unique())} subreddits\")\n" + ] + }, + { + "cell_type": "markdown", + "id": "dd44e366", + "metadata": {}, + "source": [ + "Some variables with notable variation:\n", + "\n", + "- Cluster 1 is the compare2 cluster\n", + "- Cluster 0 is the declarative cluster\n", + "- Cluster 139 is a \"said displeasing thing\" cluster\n", + "- Cluster 30 
is a surprise narrative cluster\n", + "- Cluster 15 is a scalar cluster\n", + "- 14 is concern\n", + "- 19 is looming threat?\n", + "\n", + "- Cluster 39 is self-satisfied" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "118141cb", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "3.7583405446933598" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "significant.zscore.mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "700bbfcb", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
templatezscore
subreddit
Animemes20956.068878
HistoryMemes1082.089408
dankmemes606.223277
memes164.731957
memesITA2532.345918
\n", + "
" + ], + "text/plain": [ + " template zscore\n", + "subreddit \n", + "Animemes 2095 6.068878\n", + "HistoryMemes 108 2.089408\n", + "dankmemes 60 6.223277\n", + "memes 16 4.731957\n", + "memesITA 253 2.345918" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "significant[significant.cluster.eq(\"30\")].groupby(\"subreddit\").agg({\"template\": \"first\", \"zscore\": \"first\"})" + ] + }, + { + "cell_type": "markdown", + "id": "8be57c8a", + "metadata": {}, + "source": [ + "# Innovation" + ] + }, + { + "cell_type": "markdown", + "id": "1d8a1950", + "metadata": {}, + "source": [ + "## Entropy" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "e61774e5", + "metadata": {}, + "outputs": [], + "source": [ + "from scipy.stats import entropy\n", + "from scipy.spatial.distance import jensenshannon" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "8e16edbe", + "metadata": {}, + "outputs": [], + "source": [ + "def get_entropy_values_for_cluster(semantic_cluster):\n", + " \"\"\"\n", + " Return size-k list of entropy values where\n", + " \n", + " k = the number of years the cluster has had meme instances\n", + " ith index = the entropy value i years after the first instance\n", + " \"\"\"\n", + " templates = template_clusters.cluster_to_tpl[semantic_cluster]\n", + " instances = get_instances(templates)\n", + " df = pd.DataFrame(instances)\n", + "\n", + " df.loc[:, \"timestamp\"] = pd.to_datetime(df.utc, unit=\"s\")\n", + " \n", + " # get year by num years since first date\n", + " df.loc[:, \"year\"] = ((df.timestamp - df.timestamp.min()).dt.days / 365.2425).astype(int)\n", + "\n", + " df = df[df.year != df.year.max]\n", + "\n", + " counts_by_year_by_tpl = df.groupby([\"year\", df.template]).post_id.count().reset_index()\n", + " counts_by_year_by_tpl[\"proportion\"] = counts_by_year_by_tpl.groupby(\"year\").post_id.transform(lambda x: x / x.sum())\n", + " \n", + " sample_size = 
counts_by_year_by_tpl.groupby(\"year\").post_id.sum().min()\n", + " if sample_size < 30:\n", + " return None, None\n", + " distributions = counts_by_year_by_tpl.pivot(index=\"year\", columns=\"template\", values=\"proportion\").fillna(0)\n", + "\n", + " years = distributions.index\n", + " min_years = 5\n", + " if len(years) < min_years:\n", + " return None, None\n", + " rng = np.random.default_rng(seed=0xb1ab)\n", + " resampled = pd.DataFrame([\n", + " rng.multinomial(sample_size, row)\n", + " for row in distributions.to_numpy()\n", + " ],\n", + " index=years,\n", + " columns=distributions.columns\n", + " )\n", + " max_years = 8\n", + " entropies = entropy(resampled.to_numpy(), axis=1)\n", + " \n", + " data = pd.DataFrame({\n", + " \"entropy\": entropies[:max_years],\n", + " \"year\": years[:max_years],\n", + " \"cluster\": [semantic_cluster] * min(len(years), max_years)\n", + " })\n", + "# return resampled\n", + "# resampled[\"proportion\"] = resampled.groupby(\"year\").value.transform(lambda x: x / x.sum())\n", + " \n", + "# data[\"cluster\"] = semantic_cluster\n", + "# data[\"timestamp\"] = data[\"timestamp\"] - data[\"timestamp\"].min()\n", + " return data, resampled" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "ed017e0c", + "metadata": {}, + "outputs": [], + "source": [ + "def draw_entropy():\n", + " topk = list(template_clusters.cluster_to_tpl.keys())\n", + " values = [get_entropy_values_for_cluster(clu)[0] for clu in (topk)]\n", + "# resampled = [get_entropy_values_for_cluster(clu)[1] for clu in tqdm(topk)]\n", + " values = [value for value in values if value is not None]\n", + " entropies = pd.concat(values)\n", + " print(f\"We have coverage over {len(df[df.cluster.isin(entropies.cluster.unique())])} posts\")\n", + " print(f\"and {len(values)} clusters\")\n", + " sns.pointplot(entropies, x=\"year\", y=\"entropy\")\n", + " plt.ylabel(\"Entropy\")\n", + " plt.xlabel(\"Years after introduction\")\n", + " 
plt.savefig(\"../figures/entropy.pdf\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "6486c878", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "We have coverage over 958924 posts\n", + "and 146 clusters\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "draw_entropy()" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "5eb786d7", + "metadata": {}, + "outputs": [], + "source": [ + "def get_js_dists(semantic_cluster):\n", + " \"\"\"\n", + " Return size-k list of entropy values where\n", + " \n", + " k = the number of years the cluster has had meme instances\n", + " ith index = the entropy value i years after the first instance\n", + " \"\"\"\n", + " templates = template_clusters.cluster_to_tpl[semantic_cluster]\n", + " instances = get_instances(templates)\n", + " df = pd.DataFrame(instances)\n", + "\n", + " df.loc[:, \"timestamp\"] = pd.to_datetime(df.utc, unit=\"s\")\n", + "# df.loc[:, \"year\"] = df.timestamp.dt.year\n", + " df.loc[:, \"year\"] = ((df.timestamp - df.timestamp.min()).dt.days / 365.2425).astype(int)\n", + " df = df[df.year != df.year.max]\n", + "\n", + " counts_by_year_by_tpl = df.groupby([df.year, df.template]).post_id.count().reset_index()\n", + " counts_by_year_by_tpl[\"proportion\"] = counts_by_year_by_tpl.groupby(\"year\").post_id.transform(lambda x: x / x.sum())\n", + " post_dist = counts_by_year_by_tpl.pivot(index=\"year\", columns=\"template\", values=\"proportion\").fillna(0)\n", + " pre_dist = post_dist.shift()\n", + "\n", + " sample_size = counts_by_year_by_tpl.groupby(\"year\").post_id.sum().min()\n", + " if sample_size < 30:\n", + " return None\n", + "\n", + " data = []\n", + " \n", + " for (year, pre), (_, post) in zip(pre_dist.iterrows(), post_dist.iterrows()):\n", + " dist = jensenshannon(pre, post)\n", + " data.append({\n", + " \"js_div\": dist,\n", + " \"year\": year,\n", + " \"cluster\": semantic_cluster\n", + " })\n", + " data = pd.DataFrame(data[1:])\n", + "# data[\"year\"] = data[\"year\"] - data[\"year\"].min()\n", + " return data" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "5ef71df1", + "metadata": {}, + 
"outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "577d54fa1403440aa5d71accd0c0cada", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/784 [00:00" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "def jsdiv():\n", + " topk = list(template_clusters.cluster_to_tpl.keys())#[:100]\n", + " values = [get_js_dists(clu) for clu in tqdm(topk)]\n", + " values = [value for value in values if value is not None]\n", + " dists = pd.concat(values)\n", + "\n", + " sns.lineplot(dists, x=\"year\", y=\"js_div\")\n", + " plt.ylabel(\"JS Divergence\")\n", + " plt.xlabel(\"Years after introduction\")\n", + "jsdiv()" + ] + }, + { + "cell_type": "markdown", + "id": "9fbf022b", + "metadata": {}, + "source": [ + "## Diffusion" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "aca574cf", + "metadata": {}, + "outputs": [], + "source": [ + "def pmi(matrix):\n", + " num_x, num_y = matrix.shape\n", + " total = matrix.sum()\n", + " p_xy = matrix / total\n", + " p_x = matrix.sum(axis=1, keepdims=True) / total\n", + " p_y = matrix.sum(axis=0, keepdims=True) / total\n", + " pmi = np.log2(p_xy) - np.log2(p_x * p_y)\n", + " \n", + " return pmi\n", + "\n", + "\n", + "def npmi(matrix):\n", + " h_xy = -np.log2(matrix / matrix.sum())\n", + " res = pmi(matrix) / h_xy\n", + " return np.where(np.isnan(res), -1, res)\n", + " \n", + "def ppmi(matrix):\n", + " return np.maximum(pmi(matrix), 0)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "731da727", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "2facdbd6aade4a3cb36e084fdd285fc7", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/784 [00:00:7: RuntimeWarning: divide by zero encountered in log2\n", + " pmi = np.log2(p_xy) - np.log2(p_x * p_y)\n" + ] + } + ], + "source": [ + "pmi_dict = {}\n", 
+ "for cluster in tqdm(template_clusters.cluster_to_tpl.keys()):\n", + " matrix = create_doc_term_mat(df[df.cluster.eq(cluster)], \"subreddit\", \"template\")\n", + " \n", + " a = ppmi(matrix.mat)\n", + "\n", + " for c, t in zip(*np.unravel_index(range(matrix.mat.size), matrix.mat.shape)):\n", + " pmi_dict[(matrix.idx2word[t], matrix.idx2doc[c])] = a[c, t]" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "1f5202bf", + "metadata": {}, + "outputs": [], + "source": [ + "df.loc[:, \"pmi\"] = df[[\"template\", \"subreddit\"]].apply(lambda x: pmi_dict[(x.template, x.subreddit)], axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "a21654f7", + "metadata": {}, + "outputs": [], + "source": [ + "def first_subs_for_templates_in_cluster(semantic_cluster):\n", + " templates = template_clusters.cluster_to_tpl[semantic_cluster]\n", + " instances = get_instances(templates)\n", + " df = pd.DataFrame(instances)\n", + " \n", + " return df.sort_values(by=\"utc\").groupby(\"template\").agg({\"subreddit\": \"first\"})\n", + "\n", + "def topk_proportions(x):\n", + " counts = x.head(100).value_counts()\n", + " counts /= counts.sum()\n", + " return counts.iloc[0]\n", + "\n", + "def top_sub(x):\n", + " counts = x.head(100).value_counts()\n", + " counts /= counts.sum()\n", + " return counts.index[0]\n", + "\n", + "def origin_analysis():\n", + " # calculate the proportion of each subreddit among the starters\n", + " starter_proportions = df.sort_values(\"utc\").groupby(\"template\").agg(\n", + " proportion = (\"subreddit\", topk_proportions),\n", + " subreddit = (\"subreddit\", top_sub)\n", + " )\n", + "\n", + " # bin proportions for later use\n", + " starter_proportions[\"binned_proportion\"] = starter_proportions.proportion * 100 // 10\n", + "\n", + " # filter for at least 200, to do before/after\n", + " enough_posts = [tpl for tpl, ct in df.template.value_counts().to_dict().items() if ct > 200]\n", + " ending = 
df[df.template.isin(set(enough_posts))]\n", + "\n", + " # construct list of afters for each template\n", + " tmp_dfs = []\n", + " for cluster, inds in tqdm(ending.groupby(\"template\").groups.items()):\n", + " tmp_dfs.append(\n", + " ending.loc[inds].sort_values(\"utc\", ascending=True).iloc[100:]\n", + " )\n", + " ending_df = pd.concat(tmp_dfs, axis=0)\n", + "\n", + " # calculate after PMI\n", + " ending_pmi_dict = {}\n", + " for cluster in tqdm(ending_df.cluster.unique()):\n", + " matrix = create_doc_term_mat(ending_df[ending_df.cluster.eq(cluster)], \"subreddit\", \"template\")\n", + "\n", + " a = ppmi(matrix.mat)\n", + "\n", + " if matrix.mat.size == 0:\n", + " # print(cluster)\n", + " continue\n", + "\n", + " for c, t in zip(*np.unravel_index(np.array(range(matrix.mat.size)), matrix.mat.shape)):\n", + " ending_pmi_dict[(matrix.idx2word[t], matrix.idx2doc[c])] = a[c, t]\n", + "\n", + " ending_df.loc[:, \"pmi\"] = ending_df[[\"template\", \"subreddit\"]].apply(lambda x: ending_pmi_dict.get((x.template, x.subreddit), None), axis=1)\n", + "\n", + "\n", + " starter_subs = starter_proportions.to_dict()[\"subreddit\"]\n", + " mask = ending_df.subreddit.ne(df.template.apply(starter_subs.get))\n", + "\n", + " final_pmis = pd.DataFrame(\n", + " dict(pmi=starter_proportions.reset_index()[[\"template\", \"subreddit\"]].apply(lambda x: ending_pmi_dict.get((x.template, x.subreddit), None), axis=1),\n", + " template=starter_proportions.index)\n", + " ).set_index(\"template\").pmi\n", + "\n", + "\n", + " # final_control is the avg PMI of non-starters\n", + " # final_pmi is the PMI of the starter sub for each\n", + "\n", + " starter_proportions.loc[:, \"final_control\"] = ending_df[mask].groupby(\"template\").pmi.agg(\"mean\")\n", + " starter_proportions.loc[:, \"final_pmi\"] = final_pmis\n", + "\n", + " return starter_proportions" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "10a0d7d3", + "metadata": { + "code_folding": [] + }, + "outputs": [], + 
"source": [ + "\n", + "def linregplot(starter_proportions):\n", + " data = starter_proportions\n", + " data = data[data.proportion.between(0.5, 1, inclusive=\"left\")]\n", + " sns.regplot(\n", + " data=data,\n", + " x=\"proportion\",\n", + " y=data.final_pmi - data.final_control,\n", + " )\n", + " plt.ylabel(\"Origin PMI - Control PMI\")\n", + " plt.xlabel(\"Proportion of Seed Posts by Innovator Subreddit\")\n", + "\n", + "def linregplotb(starter_proportions):\n", + " data = starter_proportions\n", + " data = data[data.proportion.between(0.5, 1, inclusive=\"left\")]\n", + " sns.regplot(\n", + " data=data,\n", + " x=\"proportion\",\n", + " y=data.final_pmi,\n", + " )\n", + " plt.ylabel(\"Origin PMI\")\n", + " plt.xlabel(\"Proportion of Seed Posts by Innovator Subreddit\")\n", + " plt.savefig(\"../figures/div_linregress.pdf\")\n", + " \n", + "def compareplot(starter_proportions):\n", + " data = starter_proportions.dropna(subset=[\"final_pmi\", \"final_control\"])\n", + " data = data[data.proportion.between(0.5, 1, inclusive=\"left\")]\n", + " data = data.melt(\n", + " id_vars=[\"binned_proportion\", \"subreddit\"],\n", + " value_vars=[\"final_pmi\", \"final_control\"],\n", + " var_name=\"Group\",\n", + " value_name=\"PPMI\"\n", + " )\n", + " sns.pointplot(\n", + " data=data.assign(\n", + " Group=data[\"Group\"].map({\"final_pmi\": \"Origin subreddit\", \"final_control\": \"Other subreddits\"})\n", + " ),\n", + " x=data.binned_proportion / 10,\n", + " y=\"PPMI\",\n", + " hue=\"Group\"\n", + " )\n", + " plt.xlabel(\"% of first 100 posts in origin subreddit\")\n", + " plt.savefig(\"../figures/divergence.pdf\")" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "89547d3d", + "metadata": {}, + "outputs": [], + "source": [ + "def regress(starter_proportions):\n", + " # do paired ttest\n", + " data = starter_proportions.dropna(subset=[\"final_pmi\", \"final_control\"])\n", + " data = data[data.proportion.between(0.5, 1, inclusive=\"left\")]\n", + "# 
results = stats.ttest_rel(data.final_pmi, data.final_control)\n", + " print(\"Paired t-test for each cluster\")\n", + " print(stats.linregress(data.proportion, data.final_pmi - data.final_control))" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "bc8290ea", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "d36866e0c65d4d8bada2a3ac8b3d26e5", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/3683 [00:00:7: RuntimeWarning: divide by zero encountered in log2\n", + " pmi = np.log2(p_xy) - np.log2(p_x * p_y)\n", + ":69: UserWarning: Boolean Series key will be reindexed to match DataFrame index.\n", + " starter_proportions.loc[:, \"final_control\"] = ending_df[mask].groupby(\"template\").pmi.agg(\"mean\")\n" + ] + } + ], + "source": [ + "starter_proportions = origin_analysis()" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "ce1c7009", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Paired t-test for each cluster\n", + "LinregressResult(slope=3.8349818189385787, intercept=-2.354880105716605, rvalue=0.5046915037221197, pvalue=1.1844118404801556e-59, stderr=0.21830066757602912, intercept_stderr=0.13621971707415811)\n" + ] + } + ], + "source": [ + "regress(starter_proportions)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "e4cfdb48", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "linregplot(starter_proportions)" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "868b7cac", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "compareplot(starter_proportions)" + ] + }, + { + "cell_type": "markdown", + "id": "a3e2d932", + "metadata": {}, + "source": [ + "# Acculturation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d660334b", + "metadata": {}, + "outputs": [], + "source": [ + "tmp = df.sort_values(\"utc\").groupby([\"subreddit\", \"user\"])\n", + "df[\"comment_timedelta\"] = tmp.utc.transform(lambda x: x - x.iloc[0])\n", + "df.comment_timedelta = pd.to_timedelta(df.comment_timedelta, unit=\"s\")\n", + "\n", + "a = df[\n", + " df.user.ne(\"[deleted]\")\n", + " & ~df.user.str.lower().str.contains(\"bot\")\n", + " & df.comment_timedelta.dt.days.lt(12 * 30)\n", + "]\n", + "\n", + "user_counts = a.groupby([\"subreddit\", \"user\"]).post_id.count().reset_index()\n", + "keep_user_subs = {(x.subreddit, x.user) for x in user_counts[user_counts.post_id.gt(10)][[\"subreddit\", \"user\"]].itertuples()}\n", + "mask = a[[\"subreddit\", \"user\"]].apply(lambda x: (x.subreddit, x.user) in keep_user_subs, axis=1)\n", + "grouped = a[mask].groupby([\"subreddit\", \"user\", a[mask].comment_timedelta.dt.days // 30]).pmi.mean().reset_index()\n", + "\n", + "keep_groups = grouped.subreddit.value_counts()[grouped.subreddit.value_counts() >= 30].index\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d36ac260", + "metadata": {}, + "outputs": [], + "source": [ + "len(g.user.unique())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "67edd139", + "metadata": {}, + "outputs": [], + "source": [ + "np.random.seed(0xb1ab)\n", + "g = grouped[grouped.subreddit.isin(keep_groups)].groupby('subreddit', group_keys=False).apply(lambda x: x.sample(min(len(x), 100), replace=True))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a12c188f", + "metadata": {}, + "outputs": [], + "source": [ + 
"sns.pointplot(x=g.comment_timedelta, y=g.pmi)\n", + "\n", + "plt.xlabel(\"Months since first comment\")\n", + "plt.ylabel(\"PPMI\")\n", + "plt.savefig(\"../figures/acculturation.pdf\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "443b4154", + "metadata": {}, + "outputs": [], + "source": [ + "print(stats.pearsonr(g.comment_timedelta, g.pmi))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fd8e33ad", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/memes/data/README.md b/memes/data/README.md new file mode 100644 index 0000000..85bf1ed --- /dev/null +++ b/memes/data/README.md @@ -0,0 +1,3 @@ +# Dataset + +Currently, the images are not available for download. However, semantic\_clusters\_roberta contains IDs of Reddit posts, the image URLs, and the cluster assignments, which you can use to reconstruct the data for all of the experiments.