You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
This repo is archived. You can view files and clone it, but cannot push or open issues/pull-requests.
 
 
 
 
 
 

74 lines
1.5 KiB

  1. import sys
  2. def clean_fragment(fragment):
  3. result = ""
  4. for c in fragment:
  5. if c.isalpha() or c in ["-", "'"]:
  6. result += c
  7. return result
  8. def split_words(text):
  9. fragments = split_fragments(text)
  10. res = list()
  11. for fragment in fragments:
  12. fragment = fragment.lower()
  13. fragment = clean_fragment(fragment)
  14. if fragment:
  15. res.append(fragment)
  16. return res
  17. def split_fragments(text):
  18. res = list()
  19. for fragment in text.split():
  20. if "’" in fragment:
  21. before = fragment.split("’")[0]
  22. after = fragment.split("’")[1]
  23. res.append(before)
  24. res.append(after)
  25. else:
  26. res.append(fragment)
  27. return res
  28. def get_frequencies(words):
  29. res = dict()
  30. for word in words:
  31. if word in res:
  32. res[word] += 1
  33. else:
  34. res[word] = 1
  35. return res
  36. def get_scores(frequencies):
  37. res = list()
  38. for word, count in frequencies.items():
  39. res.append((count, word))
  40. res.sort(reverse=True)
  41. return res
  42. def print_scores(scores):
  43. for count, word in scores:
  44. print(count, word)
  45. def main():
  46. if len(sys.argv) < 2:
  47. sys.exit("not enough arguments")
  48. filename = sys.argv[1]
  49. file = open(filename)
  50. contents = file.read()
  51. words = split_words(contents)
  52. frequencies = get_frequencies(words)
  53. scores = get_scores(frequencies)
  54. top_words = scores[:20]
  55. print_scores(top_words)
  56. main()