What makes an ArchiveOfOurOwn story successful?

Brian Hoy

1. Introduction

ArchiveOfOurOwn (Ao3) is a website where users can post their own textual works, whether that be a poem, story, or even lyrics to a song. Most of these are derivative works ("fanfictions") based on an already existing intellectual property. The website features an intricate tagging system, allowing the user to assign a number of tags relating to the content of their work. There are several different types of tags, including ratings, content warnings, and fandom, character, relationship, and freeform tags.

Once a story has been posted, users can interact with it by bookmarking it, giving "kudos" (the equivalent of a like on Twitter), or commenting, all of which appear in the work's stats. Here is a screenshot of a work showing the stats and tags:

Goal

My goal is to uncover interesting statistics about Archive of Our Own and see if we can create a machine learning model that predicts the number of kudos a story will receive. We will go through the entire data pipeline, from data collection to hypothesis testing and analysis.

2. Data Scraping

What kind of data are we scraping?

First off, we need to establish what kind of data we want to scrape. My goal is to create a database where each row is an archive work. Since there is no way I can scrape the entire Archive of Our Own database, I've decided to scrape about four thousand stories from the Marvel fandom. I've also filtered out explicit stories featuring non-school-appropriate themes, and excluded languages besides English, as I am not interested in analyzing those types of stories. Archive of Our Own encodes search filters in the URL, so I was able to generate a URL just by clicking some filters. The URL we are using for this project is:

https://archiveofourown.org/works?utf8=%E2%9C%93&work_search%5Bsort_column%5D=authors_to_sort_on&work_search%5Bother_tag_names%5D=&exclude_work_search%5Brating_ids%5D%5B%5D=13&exclude_work_search%5Barchive_warning_ids%5D%5B%5D=19&exclude_work_search%5Barchive_warning_ids%5D%5B%5D=20&work_search%5Bexcluded_tag_names%5D=&work_search%5Bcrossover%5D=&work_search%5Bcomplete%5D=&work_search%5Bwords_from%5D=&work_search%5Bwords_to%5D=&work_search%5Bdate_from%5D=&work_search%5Bdate_to%5D=&work_search%5Bquery%5D=&work_search%5Blanguage_id%5D=en&commit=Sort+and+Filter&tag_id=Marvel

Columns of the rows in our database

I want to include as much metadata about each story as is reasonably possible in each row. For each work we'll scrape attributes like the work's ID, title, word count, number of chapters, counts of its freeform, relationship, and character tags, its kudos, comments, bookmarks, and hits, and the date it was last updated, along with a few statistics computed from chapter 1's text: its word count, the number of unique words in its first 1000 words, and the number of LanguageTool mistakes.

We use Python's BeautifulSoup and requests libraries to retrieve most of these columns from the works listed on the web pages at the URL above. The last few require an additional web request to grab chapter 1's text, with the URL generated from the ID of the work. We use the open source LanguageTool (and a Python wrapper, language-tool) to count the number of mistakes in the first chapter. We compile all the relevant data into a Pandas dataframe for easy access later.
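
As a rough sketch of what this parsing looks like, here is a minimal example that pulls a few columns out of a single made-up work "blurb"; the tag and class names mirror Ao3's listing markup as I observed it, but treat them as assumptions to verify against the live page:

```python
from bs4 import BeautifulSoup

# A tiny stand-in for one work "blurb" from a search-results page.
sample_html = """
<li class="work blurb group" id="work_12345">
  <h4 class="heading"><a href="/works/12345">Sample Title</a></h4>
  <dl class="stats">
    <dd class="words">4,321</dd>
    <dd class="kudos"><a href="#">56</a></dd>
    <dd class="hits">789</dd>
  </dl>
</li>
"""

def parse_blurb(blurb):
    """Extract one database row from a single work blurb."""
    def stat(cls):
        dd = blurb.find("dd", class_=cls)
        return int(dd.get_text(strip=True).replace(",", "")) if dd else 0
    return {
        "id": int(blurb["id"].split("_")[1]),
        "title": blurb.find("h4", class_="heading").get_text(strip=True),
        "words": stat("words"),
        "kudos": stat("kudos"),
        "hits": stat("hits"),
    }

soup = BeautifulSoup(sample_html, "html.parser")
rows = [parse_blurb(b) for b in soup.find_all("li", class_="work")]
print(rows[0])
```

In the real scraper, the HTML comes from a requests call to the search URL above rather than a hard-coded string, and each page yields 20 such blurbs.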

Rate Limiting

Archive of Our Own has some pretty aggressive rate limiting, which is entirely reasonable, as I'm sure they want to avoid excessive load on their servers. They do allow scrapers in their policy ("Using bots or scraping is not against our Terms of Service unless it relates to our guidelines against spam or other activities."), but they will nonetheless respond with a 429 status code if you send too many requests. I found that using time.sleep to limit myself to one request every 1.1 seconds mostly avoided this response, but sometimes I would still get it.

Unfortunately, this also meant that I could only scrape about 4000 or so works before I had to start performing data analysis due to time constraints.
If you're interested in reading about Rate Limiting, check out this website for a nice overview of how servers implement it.
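
The throttling logic can be sketched as a small wrapper. The function name, retry count, and doubling backoff are my own choices rather than anything Ao3 prescribes, and the `get` callable is injected so the loop can be exercised without hitting the network:

```python
import time

REQUEST_INTERVAL = 1.1  # seconds; found by trial and error to mostly avoid 429s

def fetch_with_backoff(get, url, max_retries=5, interval=REQUEST_INTERVAL):
    """Call get(url), pausing between requests and doubling the pause on a 429.

    `get` is any callable returning an object with a .status_code:
    requests.get in the real scraper, or a stub for testing.
    """
    for _ in range(max_retries):
        time.sleep(interval)
        resp = get(url)
        if resp.status_code != 429:
            return resp
        interval *= 2  # back off: the server is telling us to slow down
    raise RuntimeError(f"still rate-limited after {max_retries} tries: {url}")
```

In the scraper itself this would be called as `fetch_with_backoff(requests.get, page_url)`.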

SQLite

SQLite is a lightweight SQL database engine that stores an entire database in a single file. Below, we use SQLite to store the dataframe we generated using the processPage function we made. This allows us to load the database later, after we've shut down the Python kernel. We can also stop scraping with a KeyboardInterrupt and our data will still be saved on disk.
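
A minimal sketch of the save-and-reload round trip, using a stand-in dataframe and a hypothetical ao3_works.db filename:

```python
import sqlite3
import pandas as pd

# A stand-in dataframe; in the real pipeline this is the output of processPage.
df = pd.DataFrame({"id": [1, 2], "kudos": [10, 25], "words": [5000, 12000]})

conn = sqlite3.connect("ao3_works.db")  # hypothetical filename
# Write the rows out; replacing the table makes re-running the cell safe.
df.to_sql("works", conn, if_exists="replace", index=False)

# Later — even after a KeyboardInterrupt or a kernel restart — reload with:
restored = pd.read_sql("SELECT * FROM works", conn)
conn.close()
print(restored)
```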

Resulting Dataframe

After scraping all this data, this is the dataframe we're left with:

3. Exploratory Analysis and Data Visualization

Now, let's plot our data and see if we can find anything interesting in the distributions of values across our columns. Let's start by creating histograms of each column.
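
A sketch of the histogram step, on synthetic stand-in columns (the real ones come from the scraped dataframe); log-spaced bins and a log x-axis make the heavy-tailed counts readable:

```python
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use("Agg")  # render to file; no display needed
import matplotlib.pyplot as plt

# Synthetic stand-ins for a few heavy-tailed scraped columns.
rng = np.random.default_rng(0)
df = pd.DataFrame({
    "kudos": rng.lognormal(4, 1.5, 500).astype(int) + 1,
    "words": rng.lognormal(8, 1.2, 500).astype(int) + 1,
    "chapters": rng.geometric(0.3, 500),
})

# One histogram per column, with geometrically spaced bins.
bins = np.geomspace(1, df.to_numpy().max(), 30)
axes = df.hist(bins=bins, figsize=(12, 3), layout=(1, 3))
for ax in axes.ravel():
    ax.set_xscale("log")
plt.tight_layout()
plt.savefig("histograms.png")
```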

Now, let's make a correlation matrix using Seaborn and pandas to see which variables are correlated with one another. Heatmaps are a great way to visualize this; for more information about Seaborn's heatmaps, check out this link.
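
Here is roughly what that looks like, on synthetic stand-in columns where kudos is driven by hits and words is independent of both:

```python
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import seaborn as sns

# Synthetic stand-in: hits drives kudos, words is unrelated.
rng = np.random.default_rng(1)
hits = rng.lognormal(6, 1, 300)
df = pd.DataFrame({
    "hits": hits,
    "kudos": hits * 0.08 + rng.normal(0, 5, 300),
    "words": rng.lognormal(8, 1, 300),
})

corr = df.corr()
sns.heatmap(corr, annot=True, fmt=".2f", cmap="coolwarm", vmin=-1, vmax=1)
plt.tight_layout()
plt.savefig("corr_matrix.png")
print(corr.round(2))
```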

Observations

Frequency Plots

  1. Almost all the histograms are best viewed with a logarithmic scale, i.e. there are a lot of outliers in almost every category. Take chapters, for instance. There are many works with fewer than 10 chapters, but there are a few with over 100.
  2. Some histograms have a normal distribution while others, like unique words per first 1000, are very right-skewed (I am very curious to read the works with fewer than 100 unique words in their first 1000).
  3. People tend to use freeform tags much more than relationship and character tags.
  4. The distribution of times that works were last updated is very right-skewed, indicating that most of the scraped stories are recent.

Correlation Matrix

  1. We're interested in predicting Kudos. Obviously kudos is strongly correlated with other user-interaction related statistics, like comments, hits, and bookmarks, but we will try not to use those in our machine learning model.
  2. Interestingly, comments is more strongly correlated with the number of chapters/word count than with the number of kudos.
  3. Among the non-user-interaction variables, words, ch1_unique_words_in_first_1000, ch_1_words, chapters, and freeforms seem to have the largest correlation with kudos. This may be because when authors are constantly updating their stories and adding more words, more users see them on Archive of Our Own's recent page.
  4. Kudos seems to have a negative correlation with time, i.e. older stories tend to have more Kudos.

4. Analysis, Hypothesis Testing, and Machine Learning

For this part of the data science pipeline, we're going to use regression algorithms to predict the number of kudos, comments, bookmarks, and hits using the other variables.

Predicting Kudos using Author-Controlled variables

As I'm not sure which model will perform best, let's try all of SciKit-Learn's available models with default parameters and sort them according to the average of their 5-fold cross-validation scores (R^2 values). Let's also make sure to normalize the input features using SciKit-Learn's StandardScaler. In addition, let's only include features that the author can control, like word count or number of freeforms. If we can find a trend, then this might be useful for authors who are looking to write more successful stories.

Okay, it appears every model performs fairly poorly. A well-performing regression model would have an R^2 greater than 0.5, but the highest we achieved was GammaRegressor at 0.06. Let's look at the residual plots of the top 5 models. Residual plots are a great way to assess performance: if the residuals aren't symmetric or don't have a mean of zero, you know you might not have the best model. See this link to learn more about Scikit's Yellowbrick residual plot API: Yellowbrick Residuals Plot.

Unfortunately, none of the residual plots indicate a reliable or useful regression model. The GammaRegressor model, for example, predicts the same value every time. The others show a clearly non-random, non-symmetric distribution of residuals, and also have very low R^2 values. It's clear that this approach isn't going to work, and we need to rethink our model. Due to time constraints, I was not able to rewrite the scraper to include more data, but some ideas I have are collecting more works, including the tags themselves as categorical features, and extracting stylistic features from the text with natural language processing; I expand on these in section 5.

Using non-author controlled variables to predict kudos

It is clear that comments, hits, kudos, and bookmarks are closely correlated by looking at the correlation matrix above. Since we can't predict kudos using the variables that the author can control, let's try predicting it using other statistics about the work. Note, this will not be terribly useful for authors looking to maximize their kudos, as they cannot control the variables we'll be using in our regression model.

Let's use SciKit-Learn's LinearRegression class along with feature scaling using its Pipeline class. We will split our data into a train set and a test set and visualize the residuals using YellowBrick again.
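
A sketch of that pipeline on synthetic stand-in stats, checking the residuals numerically here rather than with Yellowbrick's plot:

```python
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Synthetic stand-ins for the interaction stats: kudos is driven by
# hits and bookmarks plus noise, roughly like the real data.
rng = np.random.default_rng(0)
hits = rng.lognormal(6, 1, 400)
bookmarks = hits * 0.02 + rng.normal(0, 2, 400)
comments = hits * 0.01 + rng.normal(0, 1, 400)
X = np.column_stack([hits, bookmarks, comments])
kudos = hits * 0.08 + bookmarks * 0.5 + rng.normal(0, 3, 400)

X_train, X_test, y_train, y_test = train_test_split(X, kudos, random_state=0)
model = make_pipeline(StandardScaler(), LinearRegression())
model.fit(X_train, y_train)

r2 = model.score(X_test, y_test)
residuals = y_test - model.predict(X_test)
print(f"test R^2: {r2:.3f}, mean residual: {residuals.mean():.3f}")
```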

The model seems to work fairly well. The residuals are symmetric over the x-axis and appear to have a mean of 0. In addition, the R^2 value for the test set is great, at over 0.9! It is interesting to see that words has a negative coefficient, i.e. works with more words are predicted to have fewer kudos. This may be a case of overfitting, but the model performs well on the test set regardless.

Let's see if we can do the same for comments, bookmarks, and hits.
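
The loop over the four statistics might look like this, again on synthetic stand-ins, holding out each statistic in turn and predicting it from the other three:

```python
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Synthetic stand-in: all four stats are noisy functions of popularity.
rng = np.random.default_rng(1)
hits = rng.lognormal(6, 1, 400)
df = pd.DataFrame({
    "hits": hits,
    "kudos": hits * 0.08 + rng.normal(0, 3, 400),
    "comments": hits * 0.01 + rng.normal(0, 1, 400),
    "bookmarks": hits * 0.02 + rng.normal(0, 2, 400),
})

stats = ["kudos", "comments", "bookmarks", "hits"]
results = {}
for target in stats:
    # Predict each statistic from the other three.
    X = df[[c for c in stats if c != target]]
    model = make_pipeline(StandardScaler(), LinearRegression())
    results[target] = cross_val_score(model, X, df[target], cv=5, scoring="r2").mean()

for target, r2 in results.items():
    print(f"{target:10s} R^2 = {r2:.3f}")
```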

Observations:

5. Insights

At the beginning of this project I set out to see what attributes make a story successful on Ao3, and what authors could do to improve the odds that their story gets more hits and kudos. Unfortunately, we could not reliably predict a work's performance based on author-controlled variables. We can see from the correlation matrix that having more words, more chapters, more tags, and more unique words in the first chapter may help, as they all have a positive correlation with kudos. In addition, we found that we can predict the amount of kudos fairly well by using variables the author can't control, like bookmarks and hits.

The LanguageTool addition didn't seem to provide any insights; the correlation values were too low to be significant.

What could be improved to create a better model predicting kudos

If I were to try a project like this again, there are many things that could be improved. First of all, we only collected 4000 stories, and they were all from the Marvel franchise. There are millions of stories on Archive of Our Own, and if we could process all of them, there is a good chance we would find new insights. Second of all, we could include more categorical variables, like the actual tags we find, instead of just mapping them to a number; then we could use one-hot encoding to train our model. Finally, we could extract more features from the text itself using natural language processing techniques, capturing more of an author's style. A few ideas would be measuring the use of active verbs over total verbs, or the number of proper nouns or adjectives used per word. If you'd like to learn more about natural language processing, here is a great article giving an overview of the field.
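
For the tag idea, scikit-learn's MultiLabelBinarizer does exactly this kind of one-hot encoding of variable-length tag lists; the tags below are made up for illustration:

```python
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

# Hypothetical freeform-tag lists for three works.
tags = [
    ["Fluff", "Alternate Universe"],
    ["Angst"],
    ["Fluff", "Angst", "Slow Burn"],
]

# One 0/1 column per distinct tag; each row marks which tags a work has.
mlb = MultiLabelBinarizer()
encoded = pd.DataFrame(mlb.fit_transform(tags), columns=mlb.classes_)
print(encoded)
```

The resulting columns could be appended to the numeric features and fed to the same regression models as before.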