2002
% Workshop paper (WEBKDD'02) on mixtures of hidden Markov models for
% clickstream modelling. url points to the PDF copy, url1 to the PostScript
% copy. vgclass and url1 are non-standard fields that standard styles ignore.
@inproceedings{YpH2002,
  vgclass   = {refpap},
  author    = {Ypma, Alexander and Heskes, Tom},
  title     = {Categorization of web pages and user clustering with
               mixtures of hidden {Markov} models},
  booktitle = {Proceedings of the International Workshop on Web Knowledge
               Discovery and Data Mining (WEBKDD'02)},
  address   = {Edmonton, Canada},
  month     = jul # "~17",
  year      = {2002},
  url       = {ftp://ftp.mbfys.kun.nl/pub/snn/pub/ypma/Pdf/webkdd02.pdf},
  url1      = {ftp://ftp.mbfys.kun.nl/pub/snn/pub/ypma/PostScript/webkdd02.ps},
  abstract  = {We propose mixtures of hidden Markov models for modelling
               clickstreams of web surfers. Hence, the page categorization is learned
               from the data without the need for a (possibly cumbersome) manual
               categorization. We provide an EM algorithm for training a mixture of
               HMMs and show that additional static user data can be incorporated
               easily to possibly enhance the labelling of users. Furthermore, we use
               prior knowledge to enhance generalization and avoid numerical problems.
               We use parameter tying to decrease the danger of overfitting and to
               reduce computational overhead. We put a flat prior on the parameters to
               deal with the problem that certain transitions between page categories
               occur very seldom or not at all, in order to ensure that a nonzero
               transition probability between these categories nonetheless remains. In
               applications to artificial data and real-world web logs we demonstrate
               the usefulness of our approach. We train a mixture of HMMs on artificial
               navigation patterns, and show that the correct model is being learned.
               Moreover, we show that the use of static satellite data may enhance
               the labeling of shorter navigation patterns. When applying a mixture of
               HMMs to real-world web logs from a large Dutch commercial web site, we
               demonstrate that sensible page categorizations are being learned.},
}