feat(parles): stats words/lines, clean input

parent c36b9f70
...@@ -3,11 +3,34 @@ import os ...@@ -3,11 +3,34 @@ import os
from collections import defaultdict from collections import defaultdict
def are_invalid(lyrics: str) -> bool:
    """Return True when *lyrics* contains a known "no lyrics" placeholder.

    The text is treated as invalid as soon as any one of the placeholder
    sentences below occurs anywhere inside it (plain, case-sensitive
    substring match).

    Args:
        lyrics: Raw lyrics text of a single song.

    Returns:
        True if at least one placeholder sentence is found, else False.
    """
    placeholders = (
        "Please check back once the song has been released",
        "Lyrics for this song have yet",
        "Tell us that you would like to have the lyrics of this song",
    )
    return any(placeholder in lyrics for placeholder in placeholders)
def cleanup(lyrics):
    """Remove empty, marker and header lines from *lyrics*.

    The text is split on newline characters and a line is dropped when it

    * is empty,
    * starts with "#" or "[", or
    * contains "Paroles de la chanson" or "Paroles de Même".

    Args:
        lyrics: Raw lyrics text of a single song.

    Returns:
        The surviving lines, in their original order, joined with newline
        characters (an empty string when nothing survives).
    """
    skipped_prefixes = ("#", "[")
    skipped_fragments = ("Paroles de la chanson", "Paroles de Même")

    kept = [
        line
        for line in lyrics.split("\n")
        # An empty line is falsy, so startswith() only runs on real content.
        if line
        and not line.startswith(skipped_prefixes)
        and not any(fragment in line for fragment in skipped_fragments)
    ]
    return "\n".join(kept)
def analyse(): def analyse():
path = "../data/" path = "../data/"
files = os.listdir(path) files = os.listdir(path)
stats = { stats = {
"avg_length": defaultdict(lambda: 0), "avg_lines": defaultdict(lambda: 0),
"avg_newlines": defaultdict(lambda: 0),
"missing": defaultdict(lambda: 0) "missing": defaultdict(lambda: 0)
} }
...@@ -20,16 +43,29 @@ def analyse(): ...@@ -20,16 +43,29 @@ def analyse():
print("\n## %s ##" % artist) print("\n## %s ##" % artist)
for song in songs: for song in songs:
title = song["title"] # title = song["title"]
if song["lyrics"] is not None: if song["lyrics"] is not None:
lyrics = song["lyrics"] lyrics: str = song["lyrics"]
stats["avg_length"][artist] += len(lyrics) if are_invalid(lyrics):
stats["missing"][artist] += 1
else:
lyrics = cleanup(lyrics)
stats["avg_lines"][artist] += len(lyrics)
stats["avg_newlines"][artist] += lyrics.count("\n")
else: else:
stats["missing"][artist] += 1 stats["missing"][artist] += 1
stats["avg_length"][artist] /= len(songs) final_missing = stats["missing"][artist]
print("%s: Average song is %i long (%i missing)." % ( final_songs = len(songs) - final_missing
artist, stats["avg_length"][artist], stats["missing"][artist]))
stats["avg_lines"][artist] /= final_songs
stats["avg_newlines"][artist] /= final_songs
print("%s: %i songs, on average %i/%i long (%i missing)." % (
artist, final_songs,
stats["avg_lines"][artist],
stats["avg_newlines"][artist],
final_missing))
if __name__ == '__main__': if __name__ == '__main__':
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment