123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369 |
- package mail
- import (
- "errors"
- "eta/eta_crawler/utils"
- "fmt"
- "io"
- "log"
- "os"
- "path"
- "regexp"
- "strings"
- "time"
- "github.com/emersion/go-imap"
- "github.com/emersion/go-imap/client"
- "github.com/emersion/go-message"
- "github.com/emersion/go-message/mail"
- "github.com/h2non/filetype"
- )
// MailMessage is the parsed form of a single e-mail: header metadata, the
// body text, and any binary parts that came with it.
type MailMessage struct {
	// Date is the time taken from the message's Date header.
	Date time.Time `description:"收件时间"`
	// Uid is the unique identifier of the message inside its mailbox.
	Uid uint32 `description:"该邮件在邮箱中的唯一id"`
	// FromAddress is the sender's e-mail address.
	FromAddress string `description:"发件人邮箱"`
	// From is the sender's display name.
	From string `description:"发件人名称"`
	// Title is the message subject.
	Title string `description:"邮件标题"`
	// Content is the main body text of the message.
	Content string `description:"邮件主体正文"`
	// Resources holds resources embedded in the body, as string to string.
	Resources map[string]string `description:"正文内嵌资源"`
	// Attachment holds attachment payloads as raw bytes, keyed by a string.
	Attachment map[string][]byte `description:"附件资源"`
}
- func ListenMail(mailAddress, folder, userName, password string, readBatchSize, fromEmailIndex int, mailMessageChan chan MailMessage, mailMessageDoneChan chan bool) (err error) { // 收件箱
- defer func() {
- // 处理结束
- mailMessageDoneChan <- true
- if err != nil {
- fmt.Println("err:", err.Error())
- }
- }()
- // 建立与 IMAP 服务器的连接
- c, err := client.DialTLS(mailAddress, nil)
- if err != nil {
- fmt.Printf("连接 IMAP 服务器失败: %+v \n", err)
- return
- }
- // 最后一定不要忘记退出登录
- defer func() {
- _ = c.Logout()
- }()
- // 登录
- if err = c.Login(userName, password); err != nil {
- fmt.Printf("邮箱[%s] 登录失败: %v \n", fmt.Sprintf("%s:%s", userName, mailAddress), err)
- return
- }
- // 列出当前邮箱中的文件夹
- mailboxes := make(chan *imap.MailboxInfo, 10)
- done := make(chan error, 1) // 记录错误的 chan
- go func() {
- done <- c.List("", "*", mailboxes)
- }()
- log.Println("-->当前邮箱的文件夹 Mailboxes:")
- var folderExists bool
- for m := range mailboxes {
- log.Println("* ", m.Name)
- if m.Name == folder {
- folderExists = true
- }
- }
- err = <-done
- if err != nil {
- utils.FileLog.Error("列出邮箱列表时,出现错误:%v \n", err)
- return
- }
- log.Println("-->列出邮箱列表完毕!")
- if !folderExists {
- err = fmt.Errorf(fmt.Sprintf("文件夹[%s] 不存在 \n", folder))
- return
- }
- message.CharsetReader = myCharsetReader
- // 选择指定的文件夹
- mbox, err := c.Select(folder, false)
- if err != nil {
- err = fmt.Errorf(fmt.Sprintf("选择邮件箱失败: %+v", err))
- return
- }
- log.Printf("当前文件夹[%s]中,总共有 %d 封邮件 \n", folder, mbox.Messages)
- if mbox.Messages == 0 {
- return
- }
- // 创建一个序列集,用于批量读取邮件
- seqSet := new(imap.SeqSet)
- to := mbox.Messages // 此文件下的邮件总数
- var isStopFor bool
- step := uint32(1)
- for i := to; i >= 1; {
- start := i - step + 1
- if start < 0 {
- start = 1
- }
- seqSet.Clear()
- seqSet.AddRange(start, i) // 添加指定范围内的邮件编号
- // 获取整个消息正文
- // imap.FetchEnvelope:请求获取邮件的信封数据(例如发件人、收件人、主题等元数据)。
- // imap.FetchRFC822:请求获取完整的邮件内容,包括所有头部和正文。
- items := []imap.FetchItem{imap.FetchFlags, imap.FetchEnvelope, imap.FetchRFC822, imap.FetchBodyStructure}
- // 获取邮件内容 Start
- messages := make(chan *imap.Message, readBatchSize) // 创建一个通道,用于接收邮件消息
- fetchDone := make(chan error, 1) // 创建一个通道,用于接收错误消息
- go func() {
- // Fetch方法用于从服务器获取邮件数据,这里请求了邮件的信封和完整内容
- fetchDone <- c.Fetch(seqSet, items, messages)
- }()
- err = <-fetchDone
- if err != nil {
- utils.FileLog.Error("获取邮件信息出现错误:%v \n", err)
- return
- }
- // 获取邮件内容 End
- for msg := range messages {
- // 如果需要终止,那么就不处理了
- if isStopFor {
- continue
- }
- emailMessage, isRead, tmpErr := readEveryMsg(msg)
- if tmpErr != nil {
- // 移除本地文件
- {
- for _, v := range emailMessage.Resources {
- os.Remove(v)
- }
- }
- utils.FileLog.Error("读取邮件内容时出现错误:%v \n", tmpErr)
- continue
- }
- // 如果没有取到,那么就过滤
- if !isRead {
- continue
- }
- // 判断当前邮件id是否小于等于已经监听到的最小id,如果是,那么就不处理了
- if emailMessage.Uid <= uint32(fromEmailIndex) {
- isStopFor = true
- continue
- }
- // 如果取到了,那么写入待处理chan
- // 写入邮件处理chan
- mailMessageChan <- emailMessage
- }
- if isStopFor {
- // 已经找到了最小的邮件id,那么就退出循环了
- }
- i = i - step
- }
- log.Println("读取了所有邮件,完毕!")
- return
- }
- // document link: https://github.com/emersion/go-imap/wiki/Fetching-messages
- func readEveryMsg(msg *imap.Message) (emailMessage MailMessage, ok bool, err error) {
- ok = true
- defer func() {
- if err != nil {
- ok = false
- utils.FileLog.Error("邮件读取失败;Err:%s", err.Error())
- }
- }()
- message.CharsetReader = myCharsetReader
- emailMessage.Resources = make(map[string]string) // 内嵌资源
- emailMessage.Attachment = make(map[string][]byte) // 附件
- emailMessage.Uid = msg.Uid
- htmlStr := ``
- textStr := ``
- // 获取邮件正文
- r := msg.GetBody(&imap.BodySectionName{})
- if r == nil {
- utils.FileLog.Info("服务器没有返回消息内容")
- }
- mr, err := mail.CreateReader(r)
- if err != nil {
- err = errors.New(fmt.Sprintf("邮件读取时出现错误:%v \n", err))
- return
- }
- // 收件时间
- {
- date, err := mr.Header.Date()
- if err != nil {
- log.Println("收件时间 异常:", err.Error())
- }
- emailMessage.Date = date
- }
- // 发件人
- {
- fromStr := mr.Header.Get("From")
- // 处理无效地址的情况
- if !strings.Contains(fromStr, "@") {
- emailMessage.FromAddress = fromStr
- emailMessage.From = fromStr
- } else {
- from, tmpErr := mr.Header.AddressList("From")
- if tmpErr != nil {
- log.Println("发件人 异常:", err.Error())
- }
- if len(from) > 0 {
- emailMessage.FromAddress = from[0].Address
- emailMessage.From = from[0].Name
- }
- }
- }
- // 邮件标题
- subject, err := mr.Header.Subject()
- if err != nil {
- log.Println("邮件主题 Subject ERR:", err)
- } else {
- //log.Println("邮件主题 Subject:", subject)
- }
- emailMessage.Title = subject
- // 过滤
- for {
- p, tmpErr := mr.NextPart()
- if tmpErr == io.EOF {
- break
- } else if tmpErr != nil {
- utils.FileLog.Error("读取邮件内容时出现错误:%v \n", tmpErr)
- err = tmpErr
- return
- }
- bodyBytes, _ := io.ReadAll(p.Body)
- if err != nil {
- //log.Fatalf("读取邮件部分时出现错误:%v \n", err)
- err = errors.New(fmt.Sprintf("读取邮件部分时出现错误:%v \n", err))
- return
- }
- switch h := p.Header.(type) {
- case *mail.InlineHeader:
- // 这是消息的文本(可以是纯文本或 HTML)
- contentType := h.Get("Content-Type")
- //log.Println("消息内容content-type:", contentType)
- if strings.HasPrefix(contentType, "text/plain") {
- //log.Printf("得到正文 -> TEXT: %v \n", string(bodyBytes))
- textStr += string(bodyBytes)
- } else if strings.HasPrefix(contentType, "text/html") {
- //log.Printf("得到正文 -> HTML: %v \n", len(b))
- //log.Printf("得到正文 -> HTML: %v \n", string(bodyBytes))
- htmlStr += string(bodyBytes)
- }
- // 这是内嵌资源
- if cid := p.Header.Get("Content-ID"); cid != "" {
- // 确定文件后缀
- fileSuffix := determineFileSuffix(bodyBytes)
- fileName := fmt.Sprintf("%s%s.%s", utils.MtjhFilePath, cid[1:len(cid)-1], fileSuffix)
- err = SaveToFile(bodyBytes, fileName)
- if err != nil {
- err = errors.New(fmt.Sprintf("保存文件时出现错误:%v \n", err))
- return
- }
- emailMessage.Resources[cid] = fileName
- }
- case *mail.AttachmentHeader:
- // 这是一个附件
- filename, _ := h.Filename()
- //log.Printf("得到附件: %v,content-type:%s \n", filename, p.Header.Get("Content-Type"))
- saveName := fmt.Sprint(msg.SeqNum, utils.MD5(filename), time.Now().Format(utils.FormatDateTimeUnSpace), time.Now().Nanosecond(), path.Ext(filename))
- filePath := fmt.Sprintf("%s%s%s%s", utils.MtjhFilePath, `file`, string(os.PathSeparator), saveName)
- err = SaveToFile(bodyBytes, filePath)
- if err != nil {
- err = errors.New(fmt.Sprintf("保存文件时出现错误:%v \n", err))
- return
- }
- // 这是附件资源
- if contentDisposition := p.Header.Get("Content-Disposition"); contentDisposition != "" {
- if strings.HasPrefix(contentDisposition, "attachment") {
- emailMessage.Attachment[filename] = bodyBytes
- }
- } else if cid := p.Header.Get("Content-ID"); cid != "" {
- // 这是内嵌资源
- emailMessage.Resources[cid] = filePath
- }
- //else {
- // mailMessage.Attachment[filename] = filePath
- //}
- default:
- utils.FileLog.Info("未知格式:", h)
- //log.Println(h)
- }
- }
- emailMessage.Content = htmlStr
- if emailMessage.Content == `` {
- emailMessage.Content = textStr
- }
- //log.Println("一封邮件读取完毕")
- //log.Printf("------------------------- \n\n")
- return
- }
- // 根据文件内容确定文件后缀
- func determineFileSuffix(content []byte) string {
- kind, err := filetype.Match(content)
- if err != nil {
- utils.FileLog.Error("无法确定文件类型:%v \n", err)
- return ".bin"
- }
- return kind.Extension
- }
// SaveToFile writes content to the file named fileName, creating the file if
// it does not exist and truncating it if it does.
//
// It returns any error from creating, writing or closing the file. The close
// error is reported too, because a failed close can mean the data never
// reached the disk.
func SaveToFile(content []byte, fileName string) error {
	// 0666 (before umask) is the same permission os.Create uses.
	return os.WriteFile(fileName, content, 0666)
}
// ContainsWholeWord reports whether word occurs in s as a whole word rather
// than as part of a longer one.
//
// The word is escaped with regexp.QuoteMeta and wrapped in \b anchors, so any
// regular-expression metacharacters in it are matched literally. Go's \b is an
// ASCII word boundary, so a word that starts or ends with a non-ASCII
// character is not matched.
//
// Parameters:
//
//	s:    the string to search in
//	word: the whole word to look for
//
// Returns true when s contains word as a whole word, false otherwise.
func ContainsWholeWord(s string, word string) bool {
	escaped := regexp.QuoteMeta(word)
	matcher := regexp.MustCompile(fmt.Sprintf(`\b%s\b`, escaped))
	return matcher.FindStringIndex(s) != nil
}
|