feat(parles): Prototyping

parent 16f01706
import json import json
import operator
import os import os
from collections import defaultdict from collections import defaultdict, Counter
import spacy
def are_invalid(lyrics: str): def are_invalid(lyrics: str):
...@@ -16,8 +19,7 @@ def are_invalid(lyrics: str): ...@@ -16,8 +19,7 @@ def are_invalid(lyrics: str):
def cleanup(lyrics): def cleanup(lyrics):
lines = lyrics.split("\n") lines = lyrics.split("\n")
filtered = [l for l in lines if filtered = [l for l in lines if
l is not None and l is not None and len(l) and
len(l) and
l[0] not in ["#", "["] and l[0] not in ["#", "["] and
"Paroles de la chanson" not in l and "Paroles de la chanson" not in l and
"Paroles de Même" not in l] "Paroles de Même" not in l]
...@@ -27,33 +29,64 @@ def cleanup(lyrics): ...@@ -27,33 +29,64 @@ def cleanup(lyrics):
def analyse(): def analyse():
path = "../data/" path = "../data/"
files = os.listdir(path)
stats = { stats = {
"avg_lines": defaultdict(lambda: 0), "avg_lines": defaultdict(lambda: 0),
"avg_newlines": defaultdict(lambda: 0), "avg_newlines": defaultdict(lambda: 0),
"missing": defaultdict(lambda: 0) "missing": defaultdict(lambda: 0)
} }
print("Go")
files = os.listdir(path)
print("Listed data files.")
nlp = spacy.load("fr_core_news_md")
print("Loaded spaCy.")
for filename in files: for filename in files:
with open(path + filename) as f: with open(path + filename) as f:
content = json.load(f) content = json.load(f)
print("Loaded", filename)
artist = content["name"] artist = content["name"]
songs = content["songs"] songs = content["songs"]
print("\n## %s ##" % artist) print("\n## %s ##" % artist)
keywords = []
occurences = {}
for song in songs: for song in songs:
# title = song["title"]
if song["lyrics"] is not None: if song["lyrics"] is not None:
keyword = "anar"
occurences[song["title"]] = song["lyrics"].lower().count(keyword)
if any([i > 0 for i in occurences.values()]):
print("%s chez %s: %i" % (keyword, artist, len(occurences)))
for pair in sorted(occurences.items(), key=operator.itemgetter(1), reverse=True):
print(pair)
else:
print("%s ne parle pas de %s" % (artist, keyword))
for song in songs:
# title = song["title"]
if song["lyrics"] is None:
stats["missing"][artist] += 1
else:
lyrics: str = song["lyrics"] lyrics: str = song["lyrics"]
if are_invalid(lyrics): if are_invalid(lyrics):
stats["missing"][artist] += 1 stats["missing"][artist] += 1
else: else:
lyrics = cleanup(lyrics) lyrics = cleanup(lyrics)
stats["avg_lines"][artist] += len(lyrics) stats["avg_lines"][artist] += len(lyrics)
stats["avg_newlines"][artist] += lyrics.count("\n") stats["avg_newlines"][artist] += lyrics.count("\n")
else:
stats["missing"][artist] += 1 doc = nlp(lyrics)
keywords.extend([token.text for token in doc if
token.is_stop != True and
token.is_punct != True and
token.pos_ in ["PRON", "NOUN", "ADJ", "VERB", "INTJ", "X"]])
# 20 most common keyword tokens (pronouns, nouns, adjectives, verbs, interjections and "X"-tagged tokens), not only nouns
noun_freq = Counter(keywords)
common_nouns = noun_freq.most_common(20)
final_missing = stats["missing"][artist] final_missing = stats["missing"][artist]
final_songs = len(songs) - final_missing final_songs = len(songs) - final_missing
...@@ -66,6 +99,7 @@ def analyse(): ...@@ -66,6 +99,7 @@ def analyse():
stats["avg_lines"][artist], stats["avg_lines"][artist],
stats["avg_newlines"][artist], stats["avg_newlines"][artist],
final_missing)) final_missing))
print(common_nouns)
if __name__ == '__main__': if __name__ == '__main__':
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment