Regex

[1]:
import pandas as pd
import numpy as np
import re
[2]:
# Read the CSV at aita_url into df.
aita_url = "https://raw.githubusercontent.com/roualdes/data/master/aita_clean_really_lightweight.csv"
df = pd.read_csv(aita_url)
# Replace missing post bodies with empty strings so .str methods applied to
# df["body"] yield plain booleans instead of NaN. The result is assigned back
# rather than calling fillna(inplace=True) on the df["body"] selection: that
# chained in-place form is deprecated and, under pandas copy-on-write, only
# modifies a temporary object and leaves df unchanged.
df["body"] = df["body"].fillna("")
[3]:
# Disabled scratch code: re-reads the same CSV one row per chunk (chunksize=1)
# and prints each chunk's "id" column.
# with pd.read_csv(aita_url, chunksize=1) as rdr:
#     for chunk in rdr:
#         print(chunk["id"])
[4]:
# How many ids start with "b" and end with "q"? The pattern is anchored at both
# ends, so it occurs at most once per id and the per-id counts sum to the
# number of matching ids.
starts_b_ends_q = r'^b.*q$'
hits_per_id = df["id"].str.count(starts_b_ends_q)
hits_per_id.sum()
[4]:
12
[5]:
# Boolean mask: True for every id that begins with "b".
post_ids = df["id"]
idxb = post_ids.str.match(r"^b.*")
# Display the ids selected by the mask.
df.loc[idxb, "id"]
[5]:
2      bajsje
4      bz4m2k
7      bvxwjj
8      bn5hw7
22     bpszpy
        ...
993    bded2i
995    bpr3fg
996    b5gcz9
998    bkhy6s
999    bnkaf7
Name: id, Length: 324, dtype: object
[6]:
# Boolean mask: True for every id that ends with "q".
post_ids = df["id"]
idxq = post_ids.str.match(r".*q$")
# Display the ids selected by the mask.
df.loc[idxq, "id"]
[6]:
95     devpkq
117    bk9abq
137    b5ugbq
142    chjtyq
149    dnqemq
199    dc0nlq
207    7xtfbq
209    b7aqeq
210    blegvq
294    cv2fmq
304    dri7xq
311    cx19xq
319    cfxu9q
334    cs1soq
353    bu0bbq
364    95yglq
372    ch5h3q
380    c6drhq
400    coqrxq
432    c24eeq
434    cl8nqq
445    86v4kq
446    bdljxq
487    dgqx3q
500    clr14q
506    5m7e4q
515    cumqkq
630    b7rl0q
633    7y29kq
648    bu80iq
662    bo863q
664    b9c0tq
687    dazejq
752    dqe0pq
791    bvjiuq
867    axbe8q
882    d801yq
977    btdv5q
Name: id, dtype: object
[7]:
# Union of the two masks: ids that begin with "b" OR end with "q".
idx = idxb | idxq
# Display the ids selected by the combined mask.
df.loc[idx, "id"]
[7]:
2      bajsje
4      bz4m2k
7      bvxwjj
8      bn5hw7
22     bpszpy
        ...
993    bded2i
995    bpr3fg
996    b5gcz9
998    bkhy6s
999    bnkaf7
Name: id, Length: 350, dtype: object
[8]:
# Mean of the is_asshole column, tabulated next to its complement and rounded
# to two decimals. (Reads as a proportion if the column is a 0/1 flag --
# presumably so, given the later `== 1` comparison; confirm against the data.)
p = df['is_asshole'].mean()
data = {
    "Label": ["A-hole", "Not A-hole"],
    "Proportion": [p, 1 - p],
}
pd.DataFrame(data).round(2)
[8]:
Label Proportion
0 A-hole 0.24
1 Not A-hole 0.76
[4]:
# Boolean mask of posts whose body mentions "wife" or "girlfriend" as a whole
# word, with the first letter in either case.
# - str.contains searches the whole text. str.match anchors at the start of the
#   string and "." does not cross "\n", so a ".*word.*" pattern under str.match
#   only finds mentions in the first line of a multi-paragraph post.
# - The character classes are [Ww] / [Gg]: inside brackets "|" is a literal
#   character, not alternation, so [W|w] would also accept "|ife".
# - The group is non-capturing (?:...) so pandas does not warn about a pattern
#   with match groups being used for a pure yes/no test.
jdx = df["body"].str.contains(r'\b(?:[Ww]ife|[Gg]irlfriend)\b', regex=True)
[10]:
# Show the body of the fourth post (position 3) among those flagged by jdx.
# Filter with the boolean mask, then pick by position with .iloc. Passing the
# positional offsets from np.where to label-based .loc gives the right row only
# while df keeps its default 0..n-1 index.
is_flagged = jdx == 1  # any missing entry in jdx compares as False here
df.loc[is_flagged, "body"].iloc[3]
[10]:
"So I (27M) went for a drink with some friends (2 guys my age and also 2 girls). Just for a preliminary note, I have a girlfriend (26F) and so do the two guys. However our girlfriends weren't with us.\n\nNow anyway, as we were drinking one of the guy friends, we'll call Steve, said that he thinks he is ''in love'' with his girlfriend and that there's ''noone else in the world that he loves more than her''. Now we can call the other guy Joe and the other two girls Bethany and Katy. Joe congratulated him and Bethany and Katy said ''awwwh''. I congratulated him too that he's found a girl he really likes.\n\nThen they turned to me and asked whether I love my girlfriend. I said that I do, but I don't know why I said this, as I was drunk, I just blurted out ''I love her very much, I really do, but...there's noone in the whole world that I love more than myself. Not even my own mother.''\n\nThey looked at me startled and the girls said ''That's such a fucking douchey thing to say''. I quickly changed the topic, but when I think of it, I don't see the issue. It's socially acceptable for a guy to say he loves his girlfriend/wife more than anyone, or it's socially acceptable for a mother to say she loves her son more than anyone (or a father loves his daughter more than anyone). But it's douchey to say I love myself more than anyone?\n\nI 100% feel that I love myself more than anyone. I love myself more than my parents. More than my friends. More than my girlfriend. I don't see why people get triggered at this, AITA?"
[25]:
# Among the posts flagged by jdx, the fraction whose is_asshole value equals 1,
# rounded to two decimals.
adx = df["is_asshole"].eq(1)
flagged_and_labelled = np.logical_and(adx, jdx).sum()
flagged_total = jdx.sum()
np.round(flagged_and_labelled / flagged_total, 2)
[25]:
0.28