Just 做 IT

求知若饥 虚心若愚 안년하세요 (•‾̑⌣‾̑•)

User-CF

1 year ago 0


# encoding:utf-8
__author__ = 'long'
import random
import math
import operator


def split_data(data, m, k, seed):
    """
    每次实验选取不同的 k ( 0 ≤ k ≤ M-1)和相同的随机数种子 seed, 进行 M 次实验就可
    以得到 M 个不同的训练集和测试集,然后分别进行实验,用M次实验的平均值作为最后的评测指
    标
    :param data:
    :param m:
    :param k: k(0 ≤ k ≤ M-1)
    :param seed:
    :return:
    """

    test = []
    test_dict = {}
    train = []
    train_dict = {}
    random.seed(seed)
    for user, movie in data:
        if random.randint(0, m) == k:
        # if False:
            test.append([user, movie])
            try:
                test_dict[user].add(movie)
            except:
                s = set()
                s.add(movie)
                test_dict[user] = s
        else:
            train.append([user, movie])
            try:
                train_dict[user].add(movie)
            except:
                s = set()
                s.add(movie)
                train_dict[user] = s
    return train, test, train_dict, test_dict


def user_cf(u, i, K = 5, train={}):
    """
    UserCF 算法会给用户推荐和他兴趣最相似的 K 个用户喜欢的物品。
    如下的公式度量了 UserCF 算法中用户 u 对物品 i 的感兴趣程度:
    p(u, i) =     ∑                  W[u][v]*r[r][i]
             v ∈ S(u, K)∩N(i)
    :param u: 用户ID
    :param i: 物品ID
    :param w: 两两用户的感兴趣程度 w[u][v] = w[v][u]
    :param K:S(u, K) 包含和用户 u 兴趣最接近的 K 个用户
    :param item_users: 对某个物品有过行为的用户集合
    :return:
    """
    w, item_users = UserSimilarityOptimized2(train)
    try:
        N = item_users["%s" % i]   #对这个物品有过行为的用户
    except KeyError:
        return 0.0
    S = set()             #相似用户
    p = 0
    for v, wuv in sorted(w["%s" % u].items(), key=operator.itemgetter(1), reverse=True)[0:K]:
        S.add(v)
        if v not in N:
            continue
        p += wuv
    return p


def Recommend(u, train={}, K=5):
    W, _ = UserSimilarityOptimized2(train)
    rank = dict()
    interacted_items = train["%s" % u]
    for v, wuv in sorted(W["%s" % u].items(), key=operator.itemgetter(1), reverse=True)[0:K]:
        # print v, wuv
        for i in train[v]:
            if i in interacted_items:
                continue
            if i not in rank:
                rank[i] = 0
            rank[i] += wuv * 1
    return rank


def Recall(train, test, N, K):
    """
                    ∑  |R(u) ∩ T(u)|
                    u∈U
    Recall =  ————————————————————————
                     ∑  |T(u)|
                     u∈U
    recall rate 召回率
    :param train:
    :param test:
    :param N:
    :return: 召回率
    """
    hit = 0
    all = 0
    for user in train.keys():
        tu = test[user]
        # u, train={}, K=5
        rank = Recommend(user, train=train, K=K)
        rank = sorted(rank.items(), key=operator.itemgetter(1), reverse=True)[0:N]
        for item, pui in rank:
            if item in tu:
                hit += 1
        all += len(tu)
    return hit / (all * 1.0)


def Precision(train, test, N, K):
    """
                     ∑  |R(u) ∩ T(u)|
                    u∈U
    Precision =  ————————————————————————
                     ∑  |R(u)|
                     u∈U

    precision rate 准确率
    :param train:训练集
    :param test:测试集
    :param N:一般会选取不同的推荐列表长度 N ,
    :return: 准确率
    """
    hit = 0
    all = 0
    for user in train.keys():
        tu = test[user]
        # u, train={}, K=5
        rank = Recommend(user, train=train, K=K)
        rank = sorted(rank.items(), key=operator.itemgetter(1), reverse=True)[0:N]
        for item, pui in rank:
            if item in tu:
                hit += 1
        all += N
    return hit / (all * 1.0)


def Coverage(train, test, N, K):
    """
    该覆盖率表示最终的推荐列表中包含多大比例的物品。如果所有的物品都被推荐给至少一个
用户,那么覆盖率就是 100%
    Coverage Rate 覆盖率
    :param train: 训练集
    :param test: 测试集
    :param N: 推荐长度为N
    :return: 覆盖率
    """
    recommend_items = set()
    all_items = set()
    for user in train.keys():
        for item in train[user]:
            all_items.add(item)
            rank = Recommend(user, train=train, K=K)
            rank = sorted(rank.items(), key=operator.itemgetter(1), reverse=True)[0:N]
            for item, pui in rank:
                recommend_items.add(item)
    return len(recommend_items) / (len(all_items) * 1.0)


def Popularity(train, test, N, K):
    """
    Popularity Rate
    最后,我们还需要评测推荐的新颖度,这里用推荐列表中物品的平均流行度度量推荐结果的
新颖度。如果推荐出的物品都很热门,说明推荐的新颖度较低,否则说明推荐结果比较新颖。
    :param train:
    :param test:
    :param N:
    :param K:
    :return:
    """
    item_popularity = dict()
    for user, items in train.items():
        for item in items:
            if item not in item_popularity:
                item_popularity[item] = 0
            item_popularity[item] += 1
    ret = 0
    n = 0
    #概率大,出现机会多,不确定性小;反之就大。
    for user in train.keys():
        rank = Recommend(user, train=train, K=K)
        rank = sorted(rank.items(), key=operator.itemgetter(1), reverse=True)[0:N]
        for item, pui in rank:
            ret += math.log(1 + item_popularity[item])
            n += 1
    ret /= n * 1.0
    return ret


def UserSimilarity(train):
    W = dict()
    for u in train.keys():
        for v in train.keys():
            if u == v:
                continue
            print u, v
            try:
                W[u][v]=0
            except:
                W[u] = {}
                W[u][v] = {}
            W[u][v] = len(train[u] & train[v])
            W[u][v] /= math.sqrt(len(train[u]) * len(train[v]) * 1.0)
    return W


def UserSimilarityOptimized1(train):
    """
    :param train:
    :return:
    """
    #build inverse table for item_users

    item_users = dict()
    for u, items in train.items():
        for i in items:
            if i not in item_users:
                item_users[i] = set()
            item_users[i].add(u)
            # print i
            # print u
    #calculate co-rated items between users
    C = dict()
    N = dict()
    # print item_users["3688"]
    for i, users in item_users.items():

        for u in users:
            try:
                N[u] += 1
            except KeyError:
                N[u] = 1

            for v in users:
                if u == v:
                    continue
                if u not in C:
                    C[u] = {}
                if v not in C[u]:
                    C[u][v] = 1
                else:
                    C[u][v] += 1
    #calculate finial similarity matrix W
    W = dict()
    for u, related_users in C.items():
        for v, cuv in related_users.items():
            if u not in W:
                W[u] = {}
            if v not in W[u]:
                C[u][v] = 0
            W[u][v] = cuv / math.sqrt(N[u] * N[v])
    return W, item_users


def UserSimilarityOptimized2(train):
    # build inverse table for item_users
    item_users = dict()
    for u, items in train.items():
        for i in items:
            if i not in item_users:
                item_users[i] = set()
            item_users[i].add(u)
    #calculate co-rated items between users
    C = dict()
    N = dict()
    for i, users in item_users.items():
        for u in users:
            try:
                N[u] += 1
            except KeyError:
                N[u] = 1
            for v in users:
                if u == v:
                    continue
                if u not in C:
                    C[u] = {}
                if v not in C[u]:
                    C[u][v] = 0
                C[u][v] += 1 / math.log(1 + len(users))
    #calculate finial similarity matrix W
    W = dict()
    for u, related_users in C.items():
        for v, cuv in related_users.items():
            if u not in W:
                W[u] = {}
            if v not in W[u]:
                C[u][v] = -1
            W[u][v] = cuv / math.sqrt(N[u] * N[v])
    return W, item_users


def test1():
    train_dict = {"A": set(['a', 'b', 'd']),
    "B": set(['a', 'c']),
    "C": set(['b', 'e']),
    "D": set(['c', 'd', 'e'])}

    p = user_cf("A", 'c', 3, train_dict)
    print p
    p = user_cf("A", 'e', 3, train_dict)
    print p

if __name__ == '__main__':
    f = open("rt", "r")
    data = []
    for line in f.readlines():
        item = line.split("::")
        data.append([item[0], item[1]])
    m = 5
    train, test, train_dict, test_dict = split_data(data, 5, 1, 1)
    print len(train), len(test), len(train_dict.keys()), len(test_dict)
    w, item_users = UserSimilarityOptimized2(train_dict)
    # print item_users['3688'], w['10']['26']
    print "start recommend"
    # p = user_cf(10, 3688, 20, train_dict)
    # print p

    # rank = Recommend(1, train_dict, 10)
    # # print rank.items()
    # rank = sorted(rank.items(), key=operator.itemgetter(1), reverse=True)[0:2]
    # print rank
    N =10
    K = 10
    for K in (2, 4):
        precision = Precision(train_dict, test_dict, N=N, K=K)
        recall = Recall(train_dict, test_dict, N=N, K=K)
        coverage = Coverage(train_dict, test_dict, N=N, K=K)
        popularity = Popularity(train_dict, test_dict, N=N, K=K)
        print K, "Precision:", precision*100, "Recall:", recall*100, "Coverage:", coverage*100, "Popularity:", popularity*100

# 用户满意度,预测准确度,覆盖率,多样性,新颖性,惊喜度,信任度,实时性,健壮性,商业目的
#           离线实验 问卷调查 在线实验
# 用户满意度    ×       √       ○
# 预测准确度    √       √       ×
# 覆盖率       √       √        √
# 多样性       ○       √        ○
# 新颖性       ○       √        ○
# 惊喜度       ×       √        ×

Write a Comment