# In[1]:


import re

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


# In[2]:


# Read the tab-separated DSP export, then order the rows by their "Upload Date" value.
dsp = pd.read_csv(r"<path-to-dsp-csv>", sep="\t")
dsp = dsp.sort_values(by="Upload Date")


# In[3]:


# Expose the raw size column under a shorter name, then drop digits, dots and
# whitespace from every entry so that only the remaining (unit) text is left.
# The value counts show which leftover texts occur and how often.
dsp["file_size_str"] = dsp["File Size (when available)"]
dsp["fzs_chars"] = [re.sub(r"[0-9.\s]", "", size) for size in dsp["file_size_str"]]
dsp["fzs_chars"].value_counts()


# In[4]:


def clean_fzs(string):
    """Convert a human-readable file size into kilobytes.

    The text is matched case-insensitively against the units ``kb``, ``mb``
    and ``gb`` (decimal multiples: 1 MB = 10**3 KB, 1 GB = 10**6 KB). The
    unit may be separated from the number by whitespace or not ("12 KB" and
    "12KB" both work), and thousands separators ("1,024 KB") are ignored.

    Parameters
    ----------
    string : str
        Size text such as ``"1.5 MB"``. Non-string values (e.g. NaN or None
        for a missing size) are accepted and treated as "no size".

    Returns
    -------
    float
        The size in kilobytes, or ``np.nan`` when the value is not a string,
        contains none of the known units, or its numeric part cannot be
        parsed.
    """
    # Missing cells arrive as float NaN / None rather than text.
    if not isinstance(string, str):
        return np.nan

    text = string.lower().strip()
    # Checked in the same order as before: kb, then mb, then gb.
    for unit, kb_per_unit in (("kb", 1), ("mb", 10**3), ("gb", 10**6)):
        if unit in text:
            # Remove the unit wherever it sits, plus any thousands separators.
            number = text.replace(unit, "").replace(",", "").strip()
            try:
                return float(number) * kb_per_unit
            except ValueError:
                # Unparseable sizes are treated like unknown units: no value.
                return np.nan
    return np.nan


# In[5]:


# Parse each size string with clean_fzs (kilobytes), then derive megabytes
# by dividing by 10**3, and summarise the result.
dsp["file_size_kb"] = dsp["file_size_str"].apply(clean_fzs)
dsp["file_size_mb"] = dsp["file_size_kb"].div(10**3)
dsp["file_size_mb"].describe()


# In[6]:


# Smallest and largest values found in the "Upload Date" column, as a (min, max) pair.
tuple(dsp["Upload Date"].agg(["min", "max"]))


# In[7]:


# The first four characters of the trimmed upload date are used as the year;
# the counts per year are listed in year order.
dsp["upload_year"] = [date.strip()[:4] for date in dsp["Upload Date"]]
dsp["upload_year"].value_counts().sort_index()


# In[8]:


# Two ways of filling in missing sizes:
#   * "_ff"  - carry the previous row's size forward;
#   * "_imp" - substitute the mean size of the rows sharing the same upload year.
size_mb = dsp["file_size_mb"]
yearly_mean_mb = dsp.groupby("upload_year")["file_size_mb"].transform("mean")
dsp["file_size_mb_ff"] = size_mb.ffill()
dsp["file_size_mb_imp"] = size_mb.fillna(yearly_mean_mb)
print(dsp["file_size_mb_imp"].describe())
print(dsp["file_size_mb_ff"].describe())


# In[9]:


# Grand totals (MB sum divided by 10**6) for the raw, year-mean-imputed and
# forward-filled size columns, in that order.
tuple(
    dsp[column].sum() / 10**6
    for column in ("file_size_mb", "file_size_mb_imp", "file_size_mb_ff")
)


# In[10]:


# Sum the forward-filled sizes per upload year. The sizes are in MB, so dividing
# by 10**6 yields terabytes: the derived columns carry "_tb_" in their names so
# that they agree with the values they hold and with the "(TB)" axis labels.
MB_PER_TB = 10**6
dsp_fs_year = dsp.groupby("upload_year")["file_size_mb_ff"].sum().reset_index()
dsp_fs_year["file_size_tb_ff"] = dsp_fs_year["file_size_mb_ff"] / MB_PER_TB
dsp_fs_year["total_fs_tb_ff"] = dsp_fs_year["file_size_tb_ff"].cumsum()


# In[11]:


# Terabytes uploaded within each year.
ax = dsp_fs_year.plot(x="upload_year", y="file_size_tb_ff", kind="line", legend=False)
ax.set_xlabel("Year")
ax.set_ylabel("Total Uploaded (TB)")
ax.set_title("DSP Total Video Upload per Year")


# In[12]:


# Running total of terabytes uploaded up to and including each year.
ax = dsp_fs_year.plot(x="upload_year", y="total_fs_tb_ff", kind="line", legend=False)
ax.set_xlabel("Year")
ax.set_ylabel("Video Uploaded (TB)")
ax.set_title("DSP Cumulative YT Uploads")


# In[13]:


def std_vl(vid_len_str):
    """Pad a video length with an hours field when it only has minutes and seconds.

    A length containing exactly one colon (``"M:SS"``) is returned as
    ``"00:M:SS"``; any other text is returned unchanged apart from having its
    surrounding whitespace removed.

    Parameters
    ----------
    vid_len_str : str
        Video length text such as ``"12:34"`` or ``"1:02:03"``. Non-string
        values (e.g. NaN or None for a missing length) are accepted.

    Returns
    -------
    str
        The stripped, possibly hour-padded length. Non-string inputs are
        returned as they came in, so the caller decides how to treat them.
    """
    # Missing cells arrive as float NaN / None rather than text.
    if not isinstance(vid_len_str, str):
        return vid_len_str

    length = vid_len_str.strip()
    # Count colons explicitly: counting "every non-digit character" misfires on
    # inner whitespace or fractional seconds.
    if length.count(":") == 1:
        return "00:" + length
    return length


# In[14]:


# Pass each length through std_vl, parse the result as a timedelta and
# express it in seconds.
dsp["video_len_str"] = dsp["Video Length"]
normalized_len = dsp["video_len_str"].apply(std_vl)
dsp["video_len_delta"] = pd.to_timedelta(normalized_len)
dsp["video_len_sec"] = dsp["video_len_delta"].dt.total_seconds()
dsp["video_len_sec"].describe()


# In[15]:


# Combined length of all videos, reported in seconds, hours and days.
total_secs = dsp["video_len_sec"].sum()
print(
    f"Secs: {total_secs}",
    f"Hours: {total_secs / 3600}",
    f"Days: {(total_secs / 3600) / 24}",
    sep="\n",
)


# In[16]:


# Convert seconds to hours, total the hours per upload year and keep a
# running sum across the years.
dsp["video_len_hrs"] = dsp["video_len_sec"].div(3600)
dsp_vl_year = dsp.groupby("upload_year", as_index=False)["video_len_hrs"].sum()
dsp_vl_year["total_video_hrs"] = dsp_vl_year["video_len_hrs"].cumsum()
dsp_vl_year


# In[17]:


# Hours of video per upload year, drawn through the Axes returned by pandas.
ax = dsp_vl_year.plot.line(x="upload_year", y="video_len_hrs", legend=False)
ax.set_xlabel("Year")
ax.set_ylabel("Hours")
ax.set_title("DSP Hours of Video Uploaded to YT per Year")


# In[18]:


# Running total of video hours by upload year, drawn through the Axes returned by pandas.
ax = dsp_vl_year.plot.line(x="upload_year", y="total_video_hrs", legend=False)
ax.set_xlabel("Year")
ax.set_ylabel("Hours")
ax.set_title("Hours of Video Being Stored by YT for DSP")

