feat(parles): stats words/lines, clean input

parent c36b9f70
......@@ -3,11 +3,34 @@ import os
from collections import defaultdict
def are_invalid(lyrics: str) -> bool:
    """Return True if *lyrics* contains a known "no lyrics" placeholder.

    The check is a plain, case-sensitive substring search: the text is
    considered invalid as soon as any one of the placeholder sentences
    appears anywhere inside it.

    Args:
        lyrics: Raw lyrics text to inspect.

    Returns:
        True when at least one placeholder sentence occurs in *lyrics*,
        False otherwise (including for an empty string).
    """
    placeholders = (
        "Please check back once the song has been released",
        "Lyrics for this song have yet",
        "Tell us that you would like to have the lyrics of this song",
    )
    return any(placeholder in lyrics for placeholder in placeholders)
def cleanup(lyrics):
    """Return *lyrics* with empty, marker and header lines removed.

    The text is split on "\\n" and a line is dropped when it:

    * is empty,
    * starts with "#" or "[",
    * contains "Paroles de la chanson" or "Paroles de Même".

    Args:
        lyrics: Raw lyrics text, with lines separated by "\\n".

    Returns:
        The remaining lines joined back together with "\\n".
    """
    kept = [
        line
        for line in lyrics.split("\n")
        # The emptiness test must come first: it also guards startswith()
        # semantics so only the first character of real content is examined.
        if line
        and not line.startswith(("#", "["))
        and "Paroles de la chanson" not in line
        and "Paroles de Même" not in line
    ]
    return "\n".join(kept)
def analyse():
path = "../data/"
files = os.listdir(path)
stats = {
"avg_length": defaultdict(lambda: 0),
"avg_lines": defaultdict(lambda: 0),
"avg_newlines": defaultdict(lambda: 0),
"missing": defaultdict(lambda: 0)
}
......@@ -20,16 +43,29 @@ def analyse():
print("\n## %s ##" % artist)
for song in songs:
title = song["title"]
# title = song["title"]
if song["lyrics"] is not None:
lyrics = song["lyrics"]
stats["avg_length"][artist] += len(lyrics)
lyrics: str = song["lyrics"]
if are_invalid(lyrics):
stats["missing"][artist] += 1
else:
lyrics = cleanup(lyrics)
stats["avg_lines"][artist] += len(lyrics)
stats["avg_newlines"][artist] += lyrics.count("\n")
else:
stats["missing"][artist] += 1
stats["avg_length"][artist] /= len(songs)
print("%s: Average song is %i long (%i missing)." % (
artist, stats["avg_length"][artist], stats["missing"][artist]))
final_missing = stats["missing"][artist]
final_songs = len(songs) - final_missing
stats["avg_lines"][artist] /= final_songs
stats["avg_newlines"][artist] /= final_songs
print("%s: %i songs, on average %i/%i long (%i missing)." % (
artist, final_songs,
stats["avg_lines"][artist],
stats["avg_newlines"][artist],
final_missing))
if __name__ == '__main__':
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment