# You have an access log from a host serving Amazon.com traffic, with
# information about request timestamps, user identifiers, the product page
# URL and the status code of each request.
# ...
# access.log
#   02 Mar 2024 21:55:05,379 User:user1 URL:canonT2i.html StatusCode:200
#   02 Mar 2024 21:55:06,379 User:user2 URL:nikonD5000.html StatusCode:200
#   02 Mar 2024 21:55:07,379 User:user1 URL:tripod.html StatusCode:200
#   02 Mar 2024 21:55:07,380 User:user3 URL:canonT2i.html StatusCode:200
#   02 Mar 2024 21:55:07,381 User:user2 URL:tripod.html StatusCode:200
#   02 Mar 2024 21:55:08,381 User:user3 URL:tripod.html StatusCode:200
#   ...
#
# Given such an access log, you have to find the most popular page sequence
# consisting of 2 pages across all users.
#
# In this sample log, the most popular page sequence is
# canonT2i.html -> tripod.html (2). This was the page sequence that was
# visited by user1 and user3.
# User 2: nikonD5000.html -> tripod.html (1)
def findMostpopularSequence(file):
    """Return the most popular two-page sequence found in an access log.

    Each log entry is expected to contain whitespace-separated fields that
    include ``User:<id>`` followed by ``URL:<page>``, for example::

        02 Mar 2024 21:55:05,379 User:user1 URL:canonT2i.html StatusCode:200

    A "sequence" is a pair of pages requested consecutively by the same
    user, in log order. A user who visits pages A, B, C contributes the
    sequences (A, B) and (B, C).

    Args:
        file: Path of the access log to read.

    Returns:
        A ``(first_page, second_page)`` tuple for the sequence that occurs
        most often across all users, or ``None`` when the log contains no
        user with at least two page visits. Ties are resolved in favour of
        the sequence that was seen first in the log.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    last_page = {}    # user -> URL of that user's previous request
    pair_counts = {}  # (previous_url, url) -> number of occurrences

    with open(file, "r") as f:
        for line in f:
            user = None
            # Fields are located by their "User:"/"URL:" prefix rather than
            # by position, because the timestamp itself contains spaces.
            # This also copes with several entries sharing one line.
            for token in line.split():
                field, _, value = token.partition(":")
                if field == "User":
                    user = value
                elif field == "URL" and user is not None:
                    previous = last_page.get(user)
                    if previous is not None:
                        pair = (previous, value)
                        pair_counts[pair] = pair_counts.get(pair, 0) + 1
                    last_page[user] = value
                    # An entry is complete once its URL has been consumed.
                    user = None

    if not pair_counts:
        return None
    # max() returns the first maximal key in insertion order, which gives
    # the documented "first seen wins" tie-break.
    return max(pair_counts, key=pair_counts.get)