feat(parles): Prototyping

82eee4b4 · PLN (Algolia) · 16f01706 · 82eee4b4 · 82eee4b4
Unverified Commit 82eee4b4 authored Nov 24, 2019 by PLN (Algolia)
Show whitespace changes
Inline Side-by-side

Showing with 42 additions and 7 deletions

parles.py CommentTuParles/parles.py +41 -7

requirements.txt CommentTuParles/requirements.txt +1 -0

No files found.
--- a/CommentTuParles/parles.py
+++ b/CommentTuParles/parles.py
 import json
+import operator
 import os
-from collections import defaultdict
+from collections import defaultdict, Counter
+
+import spacy


 def are_invalid(lyrics: str):
@@ -16,8 +19,7 @@ def are_invalid(lyrics: str):
 def cleanup(lyrics):
    lines = lyrics.split("\n")
    filtered = [l for l in lines if
-                l is not None and
-                len(l) and
+                l is not None and len(l) and
                l[0] not in ["#", "["] and
                "Paroles de la chanson" not in l and
                "Paroles de Même" not in l]
@@ -27,33 +29,64 @@ def cleanup(lyrics):

 def analyse():
    path = "../data/"
-    files = os.listdir(path)
    stats = {
        "avg_lines": defaultdict(lambda: 0),
        "avg_newlines": defaultdict(lambda: 0),
        "missing": defaultdict(lambda: 0)
    }

+    print("Go")
+    files = os.listdir(path)
+    print("Listed data files.")
+    nlp = spacy.load("fr_core_news_md")
+    print("Loaded spaCy.")
+
    for filename in files:
        with open(path + filename) as f:
            content = json.load(f)
+            print("Loaded", filename)
            artist = content["name"]
            songs = content["songs"]

            print("\n## %s ##" % artist)
+            keywords = []

+            occurences = {}
            for song in songs:
-                # title = song["title"]
                if song["lyrics"] is not None:
+                    keyword = "anar"
+                    occurences[song["title"]] = song["lyrics"].lower().count(keyword)
+
+            if any([i > 0 for i in occurences.values()]):
+                print("%s chez %s: %i" % (keyword, artist, len(occurences)))
+                for pair in sorted(occurences.items(), key=operator.itemgetter(1), reverse=True):
+                    print(pair)
+            else:
+                print("%s ne parle pas de %s" % (artist, keyword))
+
+            for song in songs:
+                # title = song["title"]
+                if song["lyrics"] is None:
+                    stats["missing"][artist] += 1
+                else:
                    lyrics: str = song["lyrics"]
+
                    if are_invalid(lyrics):
                        stats["missing"][artist] += 1
                    else:
                        lyrics = cleanup(lyrics)
                        stats["avg_lines"][artist] += len(lyrics)
                        stats["avg_newlines"][artist] += lyrics.count("\n")
-                else:
-                    stats["missing"][artist] += 1
+
+                        doc = nlp(lyrics)
+                        keywords.extend([token.text for token in doc if
+                                         token.is_stop != True and
+                                         token.is_punct != True and
+                                         token.pos_ in ["PRON", "NOUN", "ADJ", "VERB", "INTJ", "X"]])
+
+            # 20 most common content tokens (PRON/NOUN/ADJ/VERB/INTJ/X, not only nouns)
+            noun_freq = Counter(keywords)
+            common_nouns = noun_freq.most_common(20)

            final_missing = stats["missing"][artist]
            final_songs = len(songs) - final_missing
@@ -66,6 +99,7 @@ def analyse():
                stats["avg_lines"][artist],
                stats["avg_newlines"][artist],
                final_missing))
+            print(common_nouns)


 if __name__ == '__main__':

--- a/CommentTuParles/requirements.txt
+++ b/CommentTuParles/requirements.txt
+spacy==2.2.3