feat(parles): Prototyping

import json
import operator
import os
from collections import defaultdict
from collections import defaultdict, Counter
import spacy
def are_invalid(lyrics: str):
......@@ -16,8 +19,7 @@ def are_invalid(lyrics: str):
def cleanup(lyrics):
lines = lyrics.split("\n")
filtered = [l for l in lines if
l is not None and
len(l) and
l is not None and len(l) and
l[0] not in ["#", "["] and
"Paroles de la chanson" not in l and
"Paroles de Même" not in l]
......@@ -27,33 +29,64 @@ def cleanup(lyrics):
def analyse():
path = "../data/"
files = os.listdir(path)
stats = {
"avg_lines": defaultdict(lambda: 0),
"avg_newlines": defaultdict(lambda: 0),
"missing": defaultdict(lambda: 0)
files = os.listdir(path)
print("Listed data files.")
nlp = spacy.load("fr_core_news_md")
print("Loaded spaCy.")
for filename in files:
with open(path + filename) as f:
content = json.load(f)
print("Loaded", filename)
artist = content["name"]
songs = content["songs"]
print("\n## %s ##" % artist)
keywords = []
occurences = {}
for song in songs:
# title = song["title"]
if song["lyrics"] is not None:
keyword = "anar"
occurences[song["title"]] = song["lyrics"].lower().count(keyword)
if any([i > 0 for i in occurences.values()]):
print("%s chez %s: %i" % (keyword, artist, len(occurences)))
for pair in sorted(occurences.items(), key=operator.itemgetter(1), reverse=True):
print("%s ne parle pas de %s" % (artist, keyword))
for song in songs:
# title = song["title"]
if song["lyrics"] is None:
stats["missing"][artist] += 1
lyrics: str = song["lyrics"]
if are_invalid(lyrics):
stats["missing"][artist] += 1
lyrics = cleanup(lyrics)
stats["avg_lines"][artist] += len(lyrics)
stats["avg_newlines"][artist] += lyrics.count("\n")
stats["missing"][artist] += 1
doc = nlp(lyrics)
keywords.extend([token.text for token in doc if
token.is_stop != True and
token.is_punct != True and
token.pos_ in ["PRON", "NOUN", "ADJ", "VERB", "INTJ", "X"]])
# 20 most common keyword tokens (non-stop, non-punctuation PRON/NOUN/ADJ/VERB/INTJ/X)
noun_freq = Counter(keywords)
common_nouns = noun_freq.most_common(20)
final_missing = stats["missing"][artist]
final_songs = len(songs) - final_missing
......@@ -66,6 +99,7 @@ def analyse():
if __name__ == '__main__':
